blob: 0281c93a6ecc65115ce98547546b425783dda704 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
Martin v. Löwis5cb69362006-04-14 09:08:42 +000039#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000040#include "Python.h"
41
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Fredrik Lundhb63588c2006-05-23 18:44:25 +000049#undef USE_INLINE /* XXX - set via configure? */
50
51#if defined(_MSC_VER) /* this is taken from _sre.c */
52#pragma warning(disable: 4710)
53/* fastest possible local call under MSVC */
54#define LOCAL(type) static __inline type __fastcall
55#elif defined(USE_INLINE)
56#define LOCAL(type) static inline type
57#else
58#define LOCAL(type) static type
59#endif
60
Guido van Rossumd57fd912000-03-10 22:53:23 +000061/* Limit for the Unicode object free list */
62
63#define MAX_UNICODE_FREELIST_SIZE 1024
64
65/* Limit for the Unicode object free list stay alive optimization.
66
67 The implementation will keep allocated Unicode memory intact for
68 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000069 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
Barry Warsaw51ac5802000-03-20 16:36:48 +000071 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000072 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000073 malloc()-overhead) bytes of unused garbage.
74
75 Setting the limit to 0 effectively turns the feature off.
76
Guido van Rossumfd4b9572000-04-10 13:51:10 +000077 Note: This is an experimental feature ! If you get core dumps when
78 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000079
80*/
81
Guido van Rossumfd4b9572000-04-10 13:51:10 +000082#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000083
84/* Endianness switches; defaults to little endian */
85
86#ifdef WORDS_BIGENDIAN
87# define BYTEORDER_IS_BIG_ENDIAN
88#else
89# define BYTEORDER_IS_LITTLE_ENDIAN
90#endif
91
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000092/* --- Globals ------------------------------------------------------------
93
94 The globals are initialized by the _PyUnicode_Init() API and should
95 not be used before calling that API.
96
97*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Anthony Baxterac6bd462006-04-13 02:06:09 +000099
100#ifdef __cplusplus
101extern "C" {
102#endif
103
Guido van Rossumd57fd912000-03-10 22:53:23 +0000104/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000105static PyUnicodeObject *unicode_freelist;
106static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000107
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000108/* The empty Unicode object is shared to improve performance. */
109static PyUnicodeObject *unicode_empty;
110
111/* Single character Unicode strings in the Latin-1 range are being
112 shared as well. */
113static PyUnicodeObject *unicode_latin1[256];
114
Fred Drakee4315f52000-05-09 19:53:39 +0000115/* Default encoding to use and assume when NULL is passed as encoding
116 parameter; it is initialized by _PyUnicode_Init().
117
118 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000119 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000120
121*/
Fred Drakee4315f52000-05-09 19:53:39 +0000122static char unicode_default_encoding[100];
123
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000124Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000125PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000126{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000127#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000128 return 0x10FFFF;
129#else
130 /* This is actually an illegal character, so it should
131 not be passed to unichr. */
132 return 0xFFFF;
133#endif
134}
135
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000136/* --- Bloom Filters ----------------------------------------------------- */
137
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant 5 bits of each Unicode character as the bit index. */

/* the linebreak mask is set up by _PyUnicode_Init() below */
143
/* Integer type holding a bloom bitmask; only bits 0..31 are ever used. */
#define BLOOM_MASK unsigned long

/* Bitmask covering the line break characters. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low 5 bits of `ch` is set in
   `mask`.  The shift is performed on an unsigned long so that selecting
   bit 31 is well defined, and `mask` is parenthesized so that compound
   expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap bloom pre-check followed by the exact line break test. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
152
153LOCAL(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
154{
155 /* calculate simple bloom-style bitmask for a given unicode string */
156
157 long mask;
158 Py_ssize_t i;
159
160 mask = 0;
161 for (i = 0; i < len; i++)
162 mask |= (1 << (ptr[i] & 0x1F));
163
164 return mask;
165}
166
167LOCAL(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
168{
169 Py_ssize_t i;
170
171 for (i = 0; i < setlen; i++)
172 if (set[i] == chr)
173 return 1;
174
Fredrik Lundh77633512006-05-23 19:47:35 +0000175 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000176}
177
/* Bloom pre-check followed by an exact membership test.  The whole
   expansion is parenthesized so the macro behaves as a single operand
   when combined with !, || or ?: at the call site. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
180
Guido van Rossumd57fd912000-03-10 22:53:23 +0000181/* --- Unicode Object ----------------------------------------------------- */
182
183static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000185 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000186{
187 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000188
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000189 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000191 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000192
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 /* Resizing shared object (unicode_empty or single character
194 objects) in-place is not allowed. Use PyUnicode_Resize()
195 instead ! */
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000196 if (unicode == unicode_empty ||
197 (unicode->length == 1 &&
198 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 return -1;
203 }
204
205 /* We allocate one more byte to make sure the string is
206 Ux0000 terminated -- XXX is this needed ? */
207 oldstr = unicode->str;
208 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
209 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000210 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211 PyErr_NoMemory();
212 return -1;
213 }
214 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000215 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000216
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000217 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000219 if (unicode->defenc) {
220 Py_DECREF(unicode->defenc);
221 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000222 }
223 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000224
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225 return 0;
226}
227
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
235
236static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000237PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000238{
239 register PyUnicodeObject *unicode;
240
Tim Petersced69f82003-09-16 20:30:58 +0000241 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 if (length == 0 && unicode_empty != NULL) {
243 Py_INCREF(unicode_empty);
244 return unicode_empty;
245 }
246
247 /* Unicode freelist & memory allocation */
248 if (unicode_freelist) {
249 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000250 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 /* Keep-Alive optimization: we only upsize the buffer,
254 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000255 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000256 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000257 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000258 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 }
260 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000261 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000263 }
264 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000265 }
266 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000267 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268 if (unicode == NULL)
269 return NULL;
270 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
271 }
272
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000273 if (!unicode->str) {
274 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000275 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000276 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000277 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000278 * the caller fails before initializing str -- unicode_resize()
279 * reads str[0], and the Keep-Alive optimization can keep memory
280 * allocated for str alive across a call to unicode_dealloc(unicode).
281 * We don't want unicode_resize to read uninitialized memory in
282 * that case.
283 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000284 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000285 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000286 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000288 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000290
291 onError:
292 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000293 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000294 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000295}
296
297static
Guido van Rossum9475a232001-10-05 20:51:39 +0000298void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000300 if (PyUnicode_CheckExact(unicode) &&
301 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000302 /* Keep-Alive optimization */
303 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000304 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305 unicode->str = NULL;
306 unicode->length = 0;
307 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000308 if (unicode->defenc) {
309 Py_DECREF(unicode->defenc);
310 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000311 }
312 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000313 *(PyUnicodeObject **)unicode = unicode_freelist;
314 unicode_freelist = unicode;
315 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316 }
317 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000318 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000319 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000320 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000321 }
322}
323
Martin v. Löwis18e16552006-02-15 17:27:45 +0000324int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000325{
326 register PyUnicodeObject *v;
327
328 /* Argument checks */
329 if (unicode == NULL) {
330 PyErr_BadInternalCall();
331 return -1;
332 }
333 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000334 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000335 PyErr_BadInternalCall();
336 return -1;
337 }
338
339 /* Resizing unicode_empty and single character objects is not
340 possible since these are being shared. We simply return a fresh
341 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000342 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000343 (v == unicode_empty || v->length == 1)) {
344 PyUnicodeObject *w = _PyUnicode_New(length);
345 if (w == NULL)
346 return -1;
347 Py_UNICODE_COPY(w->str, v->str,
348 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000349 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000350 *unicode = (PyObject *)w;
351 return 0;
352 }
353
354 /* Note that we don't have to modify *unicode for unshared Unicode
355 objects, since we can modify them in-place. */
356 return unicode_resize(v, length);
357}
358
359/* Internal API for use in unicodeobject.c only ! */
360#define _PyUnicode_Resize(unicodevar, length) \
361 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
362
Guido van Rossumd57fd912000-03-10 22:53:23 +0000363PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000364 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365{
366 PyUnicodeObject *unicode;
367
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000368 /* If the Unicode data is known at construction time, we can apply
369 some optimizations which share commonly used objects. */
370 if (u != NULL) {
371
372 /* Optimization for empty strings */
373 if (size == 0 && unicode_empty != NULL) {
374 Py_INCREF(unicode_empty);
375 return (PyObject *)unicode_empty;
376 }
377
378 /* Single character Unicode objects in the Latin-1 range are
379 shared when using this constructor */
380 if (size == 1 && *u < 256) {
381 unicode = unicode_latin1[*u];
382 if (!unicode) {
383 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000384 if (!unicode)
385 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000386 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000387 unicode_latin1[*u] = unicode;
388 }
389 Py_INCREF(unicode);
390 return (PyObject *)unicode;
391 }
392 }
Tim Petersced69f82003-09-16 20:30:58 +0000393
Guido van Rossumd57fd912000-03-10 22:53:23 +0000394 unicode = _PyUnicode_New(size);
395 if (!unicode)
396 return NULL;
397
398 /* Copy the Unicode data into the new object */
399 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000400 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401
402 return (PyObject *)unicode;
403}
404
405#ifdef HAVE_WCHAR_H
406
407PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000408 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000409{
410 PyUnicodeObject *unicode;
411
412 if (w == NULL) {
413 PyErr_BadInternalCall();
414 return NULL;
415 }
416
417 unicode = _PyUnicode_New(size);
418 if (!unicode)
419 return NULL;
420
421 /* Copy the wchar_t data into the new object */
422#ifdef HAVE_USABLE_WCHAR_T
423 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000424#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425 {
426 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000427 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000428 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000429 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430 *u++ = *w++;
431 }
432#endif
433
434 return (PyObject *)unicode;
435}
436
Martin v. Löwis18e16552006-02-15 17:27:45 +0000437Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
438 wchar_t *w,
439 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000440{
441 if (unicode == NULL) {
442 PyErr_BadInternalCall();
443 return -1;
444 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000445
446 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000447 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000448 size = PyUnicode_GET_SIZE(unicode) + 1;
449
Guido van Rossumd57fd912000-03-10 22:53:23 +0000450#ifdef HAVE_USABLE_WCHAR_T
451 memcpy(w, unicode->str, size * sizeof(wchar_t));
452#else
453 {
454 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000455 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000456 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000457 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000458 *w++ = *u++;
459 }
460#endif
461
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000462 if (size > PyUnicode_GET_SIZE(unicode))
463 return PyUnicode_GET_SIZE(unicode);
464 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000465 return size;
466}
467
468#endif
469
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000470PyObject *PyUnicode_FromOrdinal(int ordinal)
471{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000472 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000473
474#ifdef Py_UNICODE_WIDE
475 if (ordinal < 0 || ordinal > 0x10ffff) {
476 PyErr_SetString(PyExc_ValueError,
477 "unichr() arg not in range(0x110000) "
478 "(wide Python build)");
479 return NULL;
480 }
481#else
482 if (ordinal < 0 || ordinal > 0xffff) {
483 PyErr_SetString(PyExc_ValueError,
484 "unichr() arg not in range(0x10000) "
485 "(narrow Python build)");
486 return NULL;
487 }
488#endif
489
Hye-Shik Chang40574832004-04-06 07:24:51 +0000490 s[0] = (Py_UNICODE)ordinal;
491 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000492}
493
Guido van Rossumd57fd912000-03-10 22:53:23 +0000494PyObject *PyUnicode_FromObject(register PyObject *obj)
495{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000496 /* XXX Perhaps we should make this API an alias of
497 PyObject_Unicode() instead ?! */
498 if (PyUnicode_CheckExact(obj)) {
499 Py_INCREF(obj);
500 return obj;
501 }
502 if (PyUnicode_Check(obj)) {
503 /* For a Unicode subtype that's not a Unicode object,
504 return a true Unicode object with the same data. */
505 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
506 PyUnicode_GET_SIZE(obj));
507 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
509}
510
511PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
512 const char *encoding,
513 const char *errors)
514{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000515 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000516 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000517 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000518
Guido van Rossumd57fd912000-03-10 22:53:23 +0000519 if (obj == NULL) {
520 PyErr_BadInternalCall();
521 return NULL;
522 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000523
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000524#if 0
525 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000526 that no encodings is given and then redirect to
527 PyObject_Unicode() which then applies the additional logic for
528 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000529
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000530 NOTE: This API should really only be used for object which
531 represent *encoded* Unicode !
532
533 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000534 if (PyUnicode_Check(obj)) {
535 if (encoding) {
536 PyErr_SetString(PyExc_TypeError,
537 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000538 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000539 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000540 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000541 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000542#else
543 if (PyUnicode_Check(obj)) {
544 PyErr_SetString(PyExc_TypeError,
545 "decoding Unicode is not supported");
546 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000547 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000548#endif
549
550 /* Coerce object */
551 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000552 s = PyString_AS_STRING(obj);
553 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000554 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000555 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
556 /* Overwrite the error message with something more useful in
557 case of a TypeError. */
558 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000559 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000560 "coercing to Unicode: need string or buffer, "
561 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000562 obj->ob_type->tp_name);
563 goto onError;
564 }
Tim Petersced69f82003-09-16 20:30:58 +0000565
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000566 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000567 if (len == 0) {
568 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000569 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000570 }
Tim Petersced69f82003-09-16 20:30:58 +0000571 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000572 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000573
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000574 return v;
575
576 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000577 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000578}
579
580PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000581 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000582 const char *encoding,
583 const char *errors)
584{
585 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000586
587 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000588 encoding = PyUnicode_GetDefaultEncoding();
589
590 /* Shortcuts for common default encodings */
591 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000592 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000593 else if (strcmp(encoding, "latin-1") == 0)
594 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000595#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
596 else if (strcmp(encoding, "mbcs") == 0)
597 return PyUnicode_DecodeMBCS(s, size, errors);
598#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000599 else if (strcmp(encoding, "ascii") == 0)
600 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000601
602 /* Decode via the codec registry */
603 buffer = PyBuffer_FromMemory((void *)s, size);
604 if (buffer == NULL)
605 goto onError;
606 unicode = PyCodec_Decode(buffer, encoding, errors);
607 if (unicode == NULL)
608 goto onError;
609 if (!PyUnicode_Check(unicode)) {
610 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000611 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000612 unicode->ob_type->tp_name);
613 Py_DECREF(unicode);
614 goto onError;
615 }
616 Py_DECREF(buffer);
617 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000618
Guido van Rossumd57fd912000-03-10 22:53:23 +0000619 onError:
620 Py_XDECREF(buffer);
621 return NULL;
622}
623
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000624PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
625 const char *encoding,
626 const char *errors)
627{
628 PyObject *v;
629
630 if (!PyUnicode_Check(unicode)) {
631 PyErr_BadArgument();
632 goto onError;
633 }
634
635 if (encoding == NULL)
636 encoding = PyUnicode_GetDefaultEncoding();
637
638 /* Decode via the codec registry */
639 v = PyCodec_Decode(unicode, encoding, errors);
640 if (v == NULL)
641 goto onError;
642 return v;
643
644 onError:
645 return NULL;
646}
647
Guido van Rossumd57fd912000-03-10 22:53:23 +0000648PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000649 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 const char *encoding,
651 const char *errors)
652{
653 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000654
Guido van Rossumd57fd912000-03-10 22:53:23 +0000655 unicode = PyUnicode_FromUnicode(s, size);
656 if (unicode == NULL)
657 return NULL;
658 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
659 Py_DECREF(unicode);
660 return v;
661}
662
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000663PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
664 const char *encoding,
665 const char *errors)
666{
667 PyObject *v;
668
669 if (!PyUnicode_Check(unicode)) {
670 PyErr_BadArgument();
671 goto onError;
672 }
673
674 if (encoding == NULL)
675 encoding = PyUnicode_GetDefaultEncoding();
676
677 /* Encode via the codec registry */
678 v = PyCodec_Encode(unicode, encoding, errors);
679 if (v == NULL)
680 goto onError;
681 return v;
682
683 onError:
684 return NULL;
685}
686
Guido van Rossumd57fd912000-03-10 22:53:23 +0000687PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
688 const char *encoding,
689 const char *errors)
690{
691 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000692
Guido van Rossumd57fd912000-03-10 22:53:23 +0000693 if (!PyUnicode_Check(unicode)) {
694 PyErr_BadArgument();
695 goto onError;
696 }
Fred Drakee4315f52000-05-09 19:53:39 +0000697
Tim Petersced69f82003-09-16 20:30:58 +0000698 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000699 encoding = PyUnicode_GetDefaultEncoding();
700
701 /* Shortcuts for common default encodings */
702 if (errors == NULL) {
703 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000704 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000705 else if (strcmp(encoding, "latin-1") == 0)
706 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000707#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
708 else if (strcmp(encoding, "mbcs") == 0)
709 return PyUnicode_AsMBCSString(unicode);
710#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000711 else if (strcmp(encoding, "ascii") == 0)
712 return PyUnicode_AsASCIIString(unicode);
713 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000714
715 /* Encode via the codec registry */
716 v = PyCodec_Encode(unicode, encoding, errors);
717 if (v == NULL)
718 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000719 if (!PyString_Check(v)) {
720 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000721 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000722 v->ob_type->tp_name);
723 Py_DECREF(v);
724 goto onError;
725 }
726 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000727
Guido van Rossumd57fd912000-03-10 22:53:23 +0000728 onError:
729 return NULL;
730}
731
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000732PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
733 const char *errors)
734{
735 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
736
737 if (v)
738 return v;
739 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
740 if (v && errors == NULL)
741 ((PyUnicodeObject *)unicode)->defenc = v;
742 return v;
743}
744
Guido van Rossumd57fd912000-03-10 22:53:23 +0000745Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
746{
747 if (!PyUnicode_Check(unicode)) {
748 PyErr_BadArgument();
749 goto onError;
750 }
751 return PyUnicode_AS_UNICODE(unicode);
752
753 onError:
754 return NULL;
755}
756
Martin v. Löwis18e16552006-02-15 17:27:45 +0000757Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000758{
759 if (!PyUnicode_Check(unicode)) {
760 PyErr_BadArgument();
761 goto onError;
762 }
763 return PyUnicode_GET_SIZE(unicode);
764
765 onError:
766 return -1;
767}
768
Thomas Wouters78890102000-07-22 19:25:51 +0000769const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000770{
771 return unicode_default_encoding;
772}
773
774int PyUnicode_SetDefaultEncoding(const char *encoding)
775{
776 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000777
Fred Drakee4315f52000-05-09 19:53:39 +0000778 /* Make sure the encoding is valid. As side effect, this also
779 loads the encoding into the codec registry cache. */
780 v = _PyCodec_Lookup(encoding);
781 if (v == NULL)
782 goto onError;
783 Py_DECREF(v);
784 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000785 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000786 sizeof(unicode_default_encoding));
787 return 0;
788
789 onError:
790 return -1;
791}
792
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000793/* error handling callback helper:
794 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000795 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000796 and adjust various state variables.
797 return 0 on success, -1 on error
798*/
799
800static
801int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
802 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000803 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
804 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000805{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000806 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000807
808 PyObject *restuple = NULL;
809 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000810 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
811 Py_ssize_t requiredsize;
812 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000813 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000814 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000815 int res = -1;
816
817 if (*errorHandler == NULL) {
818 *errorHandler = PyCodec_LookupError(errors);
819 if (*errorHandler == NULL)
820 goto onError;
821 }
822
823 if (*exceptionObject == NULL) {
824 *exceptionObject = PyUnicodeDecodeError_Create(
825 encoding, input, insize, *startinpos, *endinpos, reason);
826 if (*exceptionObject == NULL)
827 goto onError;
828 }
829 else {
830 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
831 goto onError;
832 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
833 goto onError;
834 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
835 goto onError;
836 }
837
838 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
839 if (restuple == NULL)
840 goto onError;
841 if (!PyTuple_Check(restuple)) {
842 PyErr_Format(PyExc_TypeError, &argparse[4]);
843 goto onError;
844 }
845 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
846 goto onError;
847 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000848 newpos = insize+newpos;
849 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000850 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000851 goto onError;
852 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000853
854 /* need more space? (at least enough for what we
855 have+the replacement+the rest of the string (starting
856 at the new input position), so we won't have to check space
857 when there are no errors in the rest of the string) */
858 repptr = PyUnicode_AS_UNICODE(repunicode);
859 repsize = PyUnicode_GET_SIZE(repunicode);
860 requiredsize = *outpos + repsize + insize-newpos;
861 if (requiredsize > outsize) {
862 if (requiredsize<2*outsize)
863 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000864 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000865 goto onError;
866 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
867 }
868 *endinpos = newpos;
869 *inptr = input + newpos;
870 Py_UNICODE_COPY(*outptr, repptr, repsize);
871 *outptr += repsize;
872 *outpos += repsize;
873 /* we made it! */
874 res = 0;
875
876 onError:
877 Py_XDECREF(restuple);
878 return res;
879}
880
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000881/* --- UTF-7 Codec -------------------------------------------------------- */
882
883/* see RFC2152 for details */
884
/* Per-character UTF-7 classification for the 128 ASCII code points.
   Each entry says whether the character can be written directly:
       0 - not special (may be encoded directly)
       1 - special (cannot be encoded directly)
       2 - whitespace (directly encodable, optional)
       3 - RFC2152 Set O (directly encodable, optional) */
static
char utf7_special[128] = {
    /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F:  space ! " # $ % & ' ( ) * + , - . / */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F:  0-9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F:  @ A-O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F:  P-Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F:  ` a-o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F:  p-z { | } ~ DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
903
/* Helper macros for the UTF-7 codec.  They are #undef'ed again after the
   encoder. */

/* SPECIAL(c, encodeO, encodeWS): true if character c must be base64
   encoded.  Characters outside ASCII and class-1 characters are always
   special; whitespace (class 2) and RFC2152 Set O (class 3) characters are
   special only when the corresponding flag is set.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true if c is a base64 digit.  The range test comes first
   because isalnum() is undefined for values that do not fit in an
   unsigned char and is locale-dependent for 128..255; only ASCII
   characters can be base64 digits. */
#define B64CHAR(c) \
    ((((c) & ~0x7F) == 0) && (isalnum(c) || (c) == '+' || (c) == '/'))

/* UB64(c): numeric value (0..63) of base64 digit c.  Only meaningful when
   B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least six bits are pending in ch, emit
   the top six as a base64 digit through out and decrement bits. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending in
   ch, extract a 16-bit code unit and store it through out.  On a code unit
   in 0xDC00..0xDFFF it sets `errmsg` and jumps to the `utf7Error` label,
   both of which must exist at the expansion site. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000946
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000947PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000948 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000949 const char *errors)
950{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000951 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000952 Py_ssize_t startinpos;
953 Py_ssize_t endinpos;
954 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000955 const char *e;
956 PyUnicodeObject *unicode;
957 Py_UNICODE *p;
958 const char *errmsg = "";
959 int inShift = 0;
960 unsigned int bitsleft = 0;
961 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000962 int surrogate = 0;
963 PyObject *errorHandler = NULL;
964 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000965
966 unicode = _PyUnicode_New(size);
967 if (!unicode)
968 return NULL;
969 if (size == 0)
970 return (PyObject *)unicode;
971
972 p = unicode->str;
973 e = s + size;
974
975 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000976 Py_UNICODE ch;
977 restart:
978 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000979
980 if (inShift) {
981 if ((ch == '-') || !B64CHAR(ch)) {
982 inShift = 0;
983 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000984
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000985 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
986 if (bitsleft >= 6) {
987 /* The shift sequence has a partial character in it. If
988 bitsleft < 6 then we could just classify it as padding
989 but that is not the case here */
990
991 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000992 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000993 }
994 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000995 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000996 here so indicate the potential of a misencoded character. */
997
998 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
999 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1000 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001001 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001002 }
1003
1004 if (ch == '-') {
1005 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001006 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001007 inShift = 1;
1008 }
1009 } else if (SPECIAL(ch,0,0)) {
1010 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001011 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001012 } else {
1013 *p++ = ch;
1014 }
1015 } else {
1016 charsleft = (charsleft << 6) | UB64(ch);
1017 bitsleft += 6;
1018 s++;
1019 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1020 }
1021 }
1022 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001023 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001024 s++;
1025 if (s < e && *s == '-') {
1026 s++;
1027 *p++ = '+';
1028 } else
1029 {
1030 inShift = 1;
1031 bitsleft = 0;
1032 }
1033 }
1034 else if (SPECIAL(ch,0,0)) {
1035 errmsg = "unexpected special character";
1036 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001037 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001038 }
1039 else {
1040 *p++ = ch;
1041 s++;
1042 }
1043 continue;
1044 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001045 outpos = p-PyUnicode_AS_UNICODE(unicode);
1046 endinpos = s-starts;
1047 if (unicode_decode_call_errorhandler(
1048 errors, &errorHandler,
1049 "utf7", errmsg,
1050 starts, size, &startinpos, &endinpos, &exc, &s,
1051 (PyObject **)&unicode, &outpos, &p))
1052 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001053 }
1054
1055 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001056 outpos = p-PyUnicode_AS_UNICODE(unicode);
1057 endinpos = size;
1058 if (unicode_decode_call_errorhandler(
1059 errors, &errorHandler,
1060 "utf7", "unterminated shift sequence",
1061 starts, size, &startinpos, &endinpos, &exc, &s,
1062 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001063 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001064 if (s < e)
1065 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001066 }
1067
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001068 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001069 goto onError;
1070
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001071 Py_XDECREF(errorHandler);
1072 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001073 return (PyObject *)unicode;
1074
1075onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001076 Py_XDECREF(errorHandler);
1077 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001078 Py_DECREF(unicode);
1079 return NULL;
1080}
1081
1082
1083PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001084 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001085 int encodeSetO,
1086 int encodeWhiteSpace,
1087 const char *errors)
1088{
1089 PyObject *v;
1090 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001091 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001092 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001093 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001094 unsigned int bitsleft = 0;
1095 unsigned long charsleft = 0;
1096 char * out;
1097 char * start;
1098
1099 if (size == 0)
1100 return PyString_FromStringAndSize(NULL, 0);
1101
1102 v = PyString_FromStringAndSize(NULL, cbAllocated);
1103 if (v == NULL)
1104 return NULL;
1105
1106 start = out = PyString_AS_STRING(v);
1107 for (;i < size; ++i) {
1108 Py_UNICODE ch = s[i];
1109
1110 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001111 if (ch == '+') {
1112 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001113 *out++ = '-';
1114 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1115 charsleft = ch;
1116 bitsleft = 16;
1117 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001118 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001119 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001120 } else {
1121 *out++ = (char) ch;
1122 }
1123 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001124 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1125 *out++ = B64(charsleft << (6-bitsleft));
1126 charsleft = 0;
1127 bitsleft = 0;
1128 /* Characters not in the BASE64 set implicitly unshift the sequence
1129 so no '-' is required, except if the character is itself a '-' */
1130 if (B64CHAR(ch) || ch == '-') {
1131 *out++ = '-';
1132 }
1133 inShift = 0;
1134 *out++ = (char) ch;
1135 } else {
1136 bitsleft += 16;
1137 charsleft = (charsleft << 16) | ch;
1138 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1139
1140 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001141 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001142 or '-' then the shift sequence will be terminated implicitly and we
1143 don't have to insert a '-'. */
1144
1145 if (bitsleft == 0) {
1146 if (i + 1 < size) {
1147 Py_UNICODE ch2 = s[i+1];
1148
1149 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001150
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001151 } else if (B64CHAR(ch2) || ch2 == '-') {
1152 *out++ = '-';
1153 inShift = 0;
1154 } else {
1155 inShift = 0;
1156 }
1157
1158 }
1159 else {
1160 *out++ = '-';
1161 inShift = 0;
1162 }
1163 }
Tim Petersced69f82003-09-16 20:30:58 +00001164 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001165 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001166 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001167 if (bitsleft) {
1168 *out++= B64(charsleft << (6-bitsleft) );
1169 *out++ = '-';
1170 }
1171
Tim Peters5de98422002-04-27 18:44:32 +00001172 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001173 return v;
1174}
1175
1176#undef SPECIAL
1177#undef B64
1178#undef B64CHAR
1179#undef UB64
1180#undef ENCODE
1181#undef DECODE
1182
Guido van Rossumd57fd912000-03-10 22:53:23 +00001183/* --- UTF-8 Codec -------------------------------------------------------- */
1184
/* Map a UTF-8 lead byte to the total length of the sequence it starts.
   A zero entry marks a byte that is not a legal prefix.  See RFC 2279
   for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four-, five-, six-byte sequences; 0xFE/0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1206
Guido van Rossumd57fd912000-03-10 22:53:23 +00001207PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001208 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209 const char *errors)
1210{
Walter Dörwald69652032004-09-07 20:24:22 +00001211 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1212}
1213
1214PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001215 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001216 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001217 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001218{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001219 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001220 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001221 Py_ssize_t startinpos;
1222 Py_ssize_t endinpos;
1223 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224 const char *e;
1225 PyUnicodeObject *unicode;
1226 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001227 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001228 PyObject *errorHandler = NULL;
1229 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230
1231 /* Note: size will always be longer than the resulting Unicode
1232 character count */
1233 unicode = _PyUnicode_New(size);
1234 if (!unicode)
1235 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001236 if (size == 0) {
1237 if (consumed)
1238 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001239 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001240 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001241
1242 /* Unpack UTF-8 encoded data */
1243 p = unicode->str;
1244 e = s + size;
1245
1246 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001247 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248
1249 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001250 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 s++;
1252 continue;
1253 }
1254
1255 n = utf8_code_length[ch];
1256
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001257 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001258 if (consumed)
1259 break;
1260 else {
1261 errmsg = "unexpected end of data";
1262 startinpos = s-starts;
1263 endinpos = size;
1264 goto utf8Error;
1265 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001266 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001267
1268 switch (n) {
1269
1270 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001271 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001272 startinpos = s-starts;
1273 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001274 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275
1276 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001277 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001278 startinpos = s-starts;
1279 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001280 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001281
1282 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001283 if ((s[1] & 0xc0) != 0x80) {
1284 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001285 startinpos = s-starts;
1286 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001287 goto utf8Error;
1288 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001290 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001291 startinpos = s-starts;
1292 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001293 errmsg = "illegal encoding";
1294 goto utf8Error;
1295 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001296 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001297 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001298 break;
1299
1300 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001301 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001302 (s[2] & 0xc0) != 0x80) {
1303 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001304 startinpos = s-starts;
1305 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001306 goto utf8Error;
1307 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001308 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001309 if (ch < 0x0800) {
1310 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001311 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001312
1313 XXX For wide builds (UCS-4) we should probably try
1314 to recombine the surrogates into a single code
1315 unit.
1316 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001317 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001318 startinpos = s-starts;
1319 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001320 goto utf8Error;
1321 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001322 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001323 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001324 break;
1325
1326 case 4:
1327 if ((s[1] & 0xc0) != 0x80 ||
1328 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001329 (s[3] & 0xc0) != 0x80) {
1330 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001331 startinpos = s-starts;
1332 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001333 goto utf8Error;
1334 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001335 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1336 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1337 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001338 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001339 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001340 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001341 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001342 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001343 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001344 startinpos = s-starts;
1345 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001346 goto utf8Error;
1347 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001348#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001349 *p++ = (Py_UNICODE)ch;
1350#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001351 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001352
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001353 /* translate from 10000..10FFFF to 0..FFFF */
1354 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001355
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001356 /* high surrogate = top 10 bits added to D800 */
1357 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001358
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001359 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001360 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001361#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001362 break;
1363
1364 default:
1365 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001366 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001367 startinpos = s-starts;
1368 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001369 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001370 }
1371 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001372 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001373
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001374 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001375 outpos = p-PyUnicode_AS_UNICODE(unicode);
1376 if (unicode_decode_call_errorhandler(
1377 errors, &errorHandler,
1378 "utf8", errmsg,
1379 starts, size, &startinpos, &endinpos, &exc, &s,
1380 (PyObject **)&unicode, &outpos, &p))
1381 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382 }
Walter Dörwald69652032004-09-07 20:24:22 +00001383 if (consumed)
1384 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001385
1386 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001387 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001388 goto onError;
1389
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001390 Py_XDECREF(errorHandler);
1391 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 return (PyObject *)unicode;
1393
1394onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001395 Py_XDECREF(errorHandler);
1396 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001397 Py_DECREF(unicode);
1398 return NULL;
1399}
1400
Tim Peters602f7402002-04-27 18:03:26 +00001401/* Allocation strategy: if the string is short, convert into a stack buffer
1402 and allocate exactly as much space needed at the end. Else allocate the
1403 maximum possible needed (4 result bytes per Unicode character), and return
1404 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001405*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001406PyObject *
1407PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001408 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001409 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001410{
Tim Peters602f7402002-04-27 18:03:26 +00001411#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001412
Martin v. Löwis18e16552006-02-15 17:27:45 +00001413 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001414 PyObject *v; /* result string object */
1415 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001416 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001417 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001418 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001419
Tim Peters602f7402002-04-27 18:03:26 +00001420 assert(s != NULL);
1421 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001422
Tim Peters602f7402002-04-27 18:03:26 +00001423 if (size <= MAX_SHORT_UNICHARS) {
1424 /* Write into the stack buffer; nallocated can't overflow.
1425 * At the end, we'll allocate exactly as much heap space as it
1426 * turns out we need.
1427 */
1428 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1429 v = NULL; /* will allocate after we're done */
1430 p = stackbuf;
1431 }
1432 else {
1433 /* Overallocate on the heap, and give the excess back at the end. */
1434 nallocated = size * 4;
1435 if (nallocated / 4 != size) /* overflow! */
1436 return PyErr_NoMemory();
1437 v = PyString_FromStringAndSize(NULL, nallocated);
1438 if (v == NULL)
1439 return NULL;
1440 p = PyString_AS_STRING(v);
1441 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001442
Tim Peters602f7402002-04-27 18:03:26 +00001443 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001444 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001445
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001446 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001447 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001448 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001449
Guido van Rossumd57fd912000-03-10 22:53:23 +00001450 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001451 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001452 *p++ = (char)(0xc0 | (ch >> 6));
1453 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001454 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001455 else {
Tim Peters602f7402002-04-27 18:03:26 +00001456 /* Encode UCS2 Unicode ordinals */
1457 if (ch < 0x10000) {
1458 /* Special case: check for high surrogate */
1459 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1460 Py_UCS4 ch2 = s[i];
1461 /* Check for low surrogate and combine the two to
1462 form a UCS4 value */
1463 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001464 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001465 i++;
1466 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001467 }
Tim Peters602f7402002-04-27 18:03:26 +00001468 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001469 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001470 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001471 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1472 *p++ = (char)(0x80 | (ch & 0x3f));
1473 continue;
1474 }
1475encodeUCS4:
1476 /* Encode UCS4 Unicode ordinals */
1477 *p++ = (char)(0xf0 | (ch >> 18));
1478 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1479 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1480 *p++ = (char)(0x80 | (ch & 0x3f));
1481 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001482 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001483
Tim Peters602f7402002-04-27 18:03:26 +00001484 if (v == NULL) {
1485 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001486 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001487 assert(nneeded <= nallocated);
1488 v = PyString_FromStringAndSize(stackbuf, nneeded);
1489 }
1490 else {
1491 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001492 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001493 assert(nneeded <= nallocated);
1494 _PyString_Resize(&v, nneeded);
1495 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001497
Tim Peters602f7402002-04-27 18:03:26 +00001498#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001499}
1500
Guido van Rossumd57fd912000-03-10 22:53:23 +00001501PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1502{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001503 if (!PyUnicode_Check(unicode)) {
1504 PyErr_BadArgument();
1505 return NULL;
1506 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001507 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1508 PyUnicode_GET_SIZE(unicode),
1509 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001510}
1511
1512/* --- UTF-16 Codec ------------------------------------------------------- */
1513
Tim Peters772747b2001-08-09 22:21:55 +00001514PyObject *
1515PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001516 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001517 const char *errors,
1518 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001519{
Walter Dörwald69652032004-09-07 20:24:22 +00001520 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1521}
1522
1523PyObject *
1524PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001525 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001526 const char *errors,
1527 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001528 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001529{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001530 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001531 Py_ssize_t startinpos;
1532 Py_ssize_t endinpos;
1533 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001534 PyUnicodeObject *unicode;
1535 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001536 const unsigned char *q, *e;
1537 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001538 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001539 /* Offsets from q for retrieving byte pairs in the right order. */
1540#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1541 int ihi = 1, ilo = 0;
1542#else
1543 int ihi = 0, ilo = 1;
1544#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001545 PyObject *errorHandler = NULL;
1546 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001547
1548 /* Note: size will always be longer than the resulting Unicode
1549 character count */
1550 unicode = _PyUnicode_New(size);
1551 if (!unicode)
1552 return NULL;
1553 if (size == 0)
1554 return (PyObject *)unicode;
1555
1556 /* Unpack UTF-16 encoded data */
1557 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001558 q = (unsigned char *)s;
1559 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001560
1561 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001562 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001563
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001564 /* Check for BOM marks (U+FEFF) in the input and adjust current
1565 byte order setting accordingly. In native mode, the leading BOM
1566 mark is skipped, in all other modes, it is copied to the output
1567 stream as-is (giving a ZWNBSP character). */
1568 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001569 if (size >= 2) {
1570 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001571#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001572 if (bom == 0xFEFF) {
1573 q += 2;
1574 bo = -1;
1575 }
1576 else if (bom == 0xFFFE) {
1577 q += 2;
1578 bo = 1;
1579 }
Tim Petersced69f82003-09-16 20:30:58 +00001580#else
Walter Dörwald69652032004-09-07 20:24:22 +00001581 if (bom == 0xFEFF) {
1582 q += 2;
1583 bo = 1;
1584 }
1585 else if (bom == 0xFFFE) {
1586 q += 2;
1587 bo = -1;
1588 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001589#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001590 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001591 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001592
Tim Peters772747b2001-08-09 22:21:55 +00001593 if (bo == -1) {
1594 /* force LE */
1595 ihi = 1;
1596 ilo = 0;
1597 }
1598 else if (bo == 1) {
1599 /* force BE */
1600 ihi = 0;
1601 ilo = 1;
1602 }
1603
1604 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001605 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001606 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001607 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001608 if (consumed)
1609 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001610 errmsg = "truncated data";
1611 startinpos = ((const char *)q)-starts;
1612 endinpos = ((const char *)e)-starts;
1613 goto utf16Error;
1614 /* The remaining input chars are ignored if the callback
1615 chooses to skip the input */
1616 }
1617 ch = (q[ihi] << 8) | q[ilo];
1618
Tim Peters772747b2001-08-09 22:21:55 +00001619 q += 2;
1620
Guido van Rossumd57fd912000-03-10 22:53:23 +00001621 if (ch < 0xD800 || ch > 0xDFFF) {
1622 *p++ = ch;
1623 continue;
1624 }
1625
1626 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001627 if (q >= e) {
1628 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001629 startinpos = (((const char *)q)-2)-starts;
1630 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001631 goto utf16Error;
1632 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001633 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001634 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1635 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001636 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001637#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001638 *p++ = ch;
1639 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001640#else
1641 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001642#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001643 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001644 }
1645 else {
1646 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001647 startinpos = (((const char *)q)-4)-starts;
1648 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001649 goto utf16Error;
1650 }
1651
Guido van Rossumd57fd912000-03-10 22:53:23 +00001652 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001653 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001654 startinpos = (((const char *)q)-2)-starts;
1655 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001656 /* Fall through to report the error */
1657
1658 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001659 outpos = p-PyUnicode_AS_UNICODE(unicode);
1660 if (unicode_decode_call_errorhandler(
1661 errors, &errorHandler,
1662 "utf16", errmsg,
1663 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1664 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001665 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001666 }
1667
1668 if (byteorder)
1669 *byteorder = bo;
1670
Walter Dörwald69652032004-09-07 20:24:22 +00001671 if (consumed)
1672 *consumed = (const char *)q-starts;
1673
Guido van Rossumd57fd912000-03-10 22:53:23 +00001674 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001675 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001676 goto onError;
1677
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001678 Py_XDECREF(errorHandler);
1679 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001680 return (PyObject *)unicode;
1681
1682onError:
1683 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001684 Py_XDECREF(errorHandler);
1685 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001686 return NULL;
1687}
1688
Tim Peters772747b2001-08-09 22:21:55 +00001689PyObject *
1690PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001691 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001692 const char *errors,
1693 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001694{
1695 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001696 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001697#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001698 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001699#else
1700 const int pairs = 0;
1701#endif
Tim Peters772747b2001-08-09 22:21:55 +00001702 /* Offsets from p for storing byte pairs in the right order. */
1703#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1704 int ihi = 1, ilo = 0;
1705#else
1706 int ihi = 0, ilo = 1;
1707#endif
1708
1709#define STORECHAR(CH) \
1710 do { \
1711 p[ihi] = ((CH) >> 8) & 0xff; \
1712 p[ilo] = (CH) & 0xff; \
1713 p += 2; \
1714 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001715
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001716#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001717 for (i = pairs = 0; i < size; i++)
1718 if (s[i] >= 0x10000)
1719 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001720#endif
Tim Petersced69f82003-09-16 20:30:58 +00001721 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001722 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001723 if (v == NULL)
1724 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001725
Tim Peters772747b2001-08-09 22:21:55 +00001726 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001727 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001728 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001729 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001730 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001731
1732 if (byteorder == -1) {
1733 /* force LE */
1734 ihi = 1;
1735 ilo = 0;
1736 }
1737 else if (byteorder == 1) {
1738 /* force BE */
1739 ihi = 0;
1740 ilo = 1;
1741 }
1742
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001743 while (size-- > 0) {
1744 Py_UNICODE ch = *s++;
1745 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001746#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001747 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001748 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1749 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001750 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001751#endif
Tim Peters772747b2001-08-09 22:21:55 +00001752 STORECHAR(ch);
1753 if (ch2)
1754 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001755 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001756 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001757#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001758}
1759
1760PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1761{
1762 if (!PyUnicode_Check(unicode)) {
1763 PyErr_BadArgument();
1764 return NULL;
1765 }
1766 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1767 PyUnicode_GET_SIZE(unicode),
1768 NULL,
1769 0);
1770}
1771
1772/* --- Unicode Escape Codec ----------------------------------------------- */
1773
Fredrik Lundh06d12682001-01-24 07:59:11 +00001774static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001775
Guido van Rossumd57fd912000-03-10 22:53:23 +00001776PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001777 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001778 const char *errors)
1779{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001780 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001781 Py_ssize_t startinpos;
1782 Py_ssize_t endinpos;
1783 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001784 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001785 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001786 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001787 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001788 char* message;
1789 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001790 PyObject *errorHandler = NULL;
1791 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001792
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 /* Escaped strings will always be longer than the resulting
1794 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001795 length after conversion to the true value.
1796 (but if the error callback returns a long replacement string
1797 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798 v = _PyUnicode_New(size);
1799 if (v == NULL)
1800 goto onError;
1801 if (size == 0)
1802 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001803
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001804 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001805 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001806
Guido van Rossumd57fd912000-03-10 22:53:23 +00001807 while (s < end) {
1808 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001809 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001810 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001811
1812 /* Non-escape characters are interpreted as Unicode ordinals */
1813 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001814 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001815 continue;
1816 }
1817
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001818 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001819 /* \ - Escapes */
1820 s++;
1821 switch (*s++) {
1822
1823 /* \x escapes */
1824 case '\n': break;
1825 case '\\': *p++ = '\\'; break;
1826 case '\'': *p++ = '\''; break;
1827 case '\"': *p++ = '\"'; break;
1828 case 'b': *p++ = '\b'; break;
1829 case 'f': *p++ = '\014'; break; /* FF */
1830 case 't': *p++ = '\t'; break;
1831 case 'n': *p++ = '\n'; break;
1832 case 'r': *p++ = '\r'; break;
1833 case 'v': *p++ = '\013'; break; /* VT */
1834 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1835
1836 /* \OOO (octal) escapes */
1837 case '0': case '1': case '2': case '3':
1838 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001839 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001840 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001841 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001842 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001843 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001844 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001845 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 break;
1847
Fredrik Lundhccc74732001-02-18 22:13:49 +00001848 /* hex escapes */
1849 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001850 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001851 digits = 2;
1852 message = "truncated \\xXX escape";
1853 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854
Fredrik Lundhccc74732001-02-18 22:13:49 +00001855 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001857 digits = 4;
1858 message = "truncated \\uXXXX escape";
1859 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001860
Fredrik Lundhccc74732001-02-18 22:13:49 +00001861 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001862 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001863 digits = 8;
1864 message = "truncated \\UXXXXXXXX escape";
1865 hexescape:
1866 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001867 outpos = p-PyUnicode_AS_UNICODE(v);
1868 if (s+digits>end) {
1869 endinpos = size;
1870 if (unicode_decode_call_errorhandler(
1871 errors, &errorHandler,
1872 "unicodeescape", "end of string in escape sequence",
1873 starts, size, &startinpos, &endinpos, &exc, &s,
1874 (PyObject **)&v, &outpos, &p))
1875 goto onError;
1876 goto nextByte;
1877 }
1878 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001879 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001880 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001881 endinpos = (s+i+1)-starts;
1882 if (unicode_decode_call_errorhandler(
1883 errors, &errorHandler,
1884 "unicodeescape", message,
1885 starts, size, &startinpos, &endinpos, &exc, &s,
1886 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001887 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001888 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001889 }
1890 chr = (chr<<4) & ~0xF;
1891 if (c >= '0' && c <= '9')
1892 chr += c - '0';
1893 else if (c >= 'a' && c <= 'f')
1894 chr += 10 + c - 'a';
1895 else
1896 chr += 10 + c - 'A';
1897 }
1898 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001899 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001900 /* _decoding_error will have already written into the
1901 target buffer. */
1902 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001903 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001904 /* when we get here, chr is a 32-bit unicode character */
1905 if (chr <= 0xffff)
1906 /* UCS-2 character */
1907 *p++ = (Py_UNICODE) chr;
1908 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001909 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001910 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001911#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001912 *p++ = chr;
1913#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001914 chr -= 0x10000L;
1915 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001916 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001917#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001918 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001919 endinpos = s-starts;
1920 outpos = p-PyUnicode_AS_UNICODE(v);
1921 if (unicode_decode_call_errorhandler(
1922 errors, &errorHandler,
1923 "unicodeescape", "illegal Unicode character",
1924 starts, size, &startinpos, &endinpos, &exc, &s,
1925 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001926 goto onError;
1927 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001928 break;
1929
1930 /* \N{name} */
1931 case 'N':
1932 message = "malformed \\N character escape";
1933 if (ucnhash_CAPI == NULL) {
1934 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001935 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001936 m = PyImport_ImportModule("unicodedata");
1937 if (m == NULL)
1938 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001939 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001940 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001941 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001942 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00001943 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001944 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001945 if (ucnhash_CAPI == NULL)
1946 goto ucnhashError;
1947 }
1948 if (*s == '{') {
1949 const char *start = s+1;
1950 /* look for the closing brace */
1951 while (*s != '}' && s < end)
1952 s++;
1953 if (s > start && s < end && *s == '}') {
1954 /* found a name. look it up in the unicode database */
1955 message = "unknown Unicode character name";
1956 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001957 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001958 goto store;
1959 }
1960 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001961 endinpos = s-starts;
1962 outpos = p-PyUnicode_AS_UNICODE(v);
1963 if (unicode_decode_call_errorhandler(
1964 errors, &errorHandler,
1965 "unicodeescape", message,
1966 starts, size, &startinpos, &endinpos, &exc, &s,
1967 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001968 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001969 break;
1970
1971 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001972 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001973 message = "\\ at end of string";
1974 s--;
1975 endinpos = s-starts;
1976 outpos = p-PyUnicode_AS_UNICODE(v);
1977 if (unicode_decode_call_errorhandler(
1978 errors, &errorHandler,
1979 "unicodeescape", message,
1980 starts, size, &startinpos, &endinpos, &exc, &s,
1981 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001982 goto onError;
1983 }
1984 else {
1985 *p++ = '\\';
1986 *p++ = (unsigned char)s[-1];
1987 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001988 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001990 nextByte:
1991 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00001993 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001994 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001995 Py_XDECREF(errorHandler);
1996 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001998
Fredrik Lundhccc74732001-02-18 22:13:49 +00001999ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002000 PyErr_SetString(
2001 PyExc_UnicodeError,
2002 "\\N escapes not supported (can't load unicodedata module)"
2003 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002004 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002005 Py_XDECREF(errorHandler);
2006 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002007 return NULL;
2008
Fredrik Lundhccc74732001-02-18 22:13:49 +00002009onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002010 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002011 Py_XDECREF(errorHandler);
2012 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002013 return NULL;
2014}
2015
2016/* Return a Unicode-Escape string version of the Unicode object.
2017
2018 If quotes is true, the string is enclosed in u"" or u'' quotes as
2019 appropriate.
2020
2021*/
2022
Barry Warsaw51ac5802000-03-20 16:36:48 +00002023static const Py_UNICODE *findchar(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002024 Py_ssize_t size,
Barry Warsaw51ac5802000-03-20 16:36:48 +00002025 Py_UNICODE ch);
2026
Guido van Rossumd57fd912000-03-10 22:53:23 +00002027static
2028PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002029 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002030 int quotes)
2031{
2032 PyObject *repr;
2033 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002034
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002035 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036
2037 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
2038 if (repr == NULL)
2039 return NULL;
2040
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002041 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042
2043 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002045 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002046 !findchar(s, size, '"')) ? '"' : '\'';
2047 }
2048 while (size-- > 0) {
2049 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002050
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002051 /* Escape quotes and backslashes */
2052 if ((quotes &&
2053 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 *p++ = '\\';
2055 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002056 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002057 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002058
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002059#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002060 /* Map 21-bit characters to '\U00xxxxxx' */
2061 else if (ch >= 0x10000) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00002062 Py_ssize_t offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00002063
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002064 /* Resize the string if necessary */
2065 if (offset + 12 > PyString_GET_SIZE(repr)) {
2066 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00002067 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002068 p = PyString_AS_STRING(repr) + offset;
2069 }
2070
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002071 *p++ = '\\';
2072 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002073 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2074 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2075 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2076 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2077 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2078 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2079 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002080 *p++ = hexdigit[ch & 0x0000000F];
2081 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002082 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002083#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002084 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2085 else if (ch >= 0xD800 && ch < 0xDC00) {
2086 Py_UNICODE ch2;
2087 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002088
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002089 ch2 = *s++;
2090 size--;
2091 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2092 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2093 *p++ = '\\';
2094 *p++ = 'U';
2095 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2096 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2097 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2098 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2099 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2100 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2101 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2102 *p++ = hexdigit[ucs & 0x0000000F];
2103 continue;
2104 }
2105 /* Fall through: isolated surrogates are copied as-is */
2106 s--;
2107 size++;
2108 }
2109
Guido van Rossumd57fd912000-03-10 22:53:23 +00002110 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002111 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002112 *p++ = '\\';
2113 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002114 *p++ = hexdigit[(ch >> 12) & 0x000F];
2115 *p++ = hexdigit[(ch >> 8) & 0x000F];
2116 *p++ = hexdigit[(ch >> 4) & 0x000F];
2117 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002118 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002119
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002120 /* Map special whitespace to '\t', \n', '\r' */
2121 else if (ch == '\t') {
2122 *p++ = '\\';
2123 *p++ = 't';
2124 }
2125 else if (ch == '\n') {
2126 *p++ = '\\';
2127 *p++ = 'n';
2128 }
2129 else if (ch == '\r') {
2130 *p++ = '\\';
2131 *p++ = 'r';
2132 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002133
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002134 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002135 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002136 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002137 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002138 *p++ = hexdigit[(ch >> 4) & 0x000F];
2139 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002140 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002141
Guido van Rossumd57fd912000-03-10 22:53:23 +00002142 /* Copy everything else as-is */
2143 else
2144 *p++ = (char) ch;
2145 }
2146 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002147 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148
2149 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002150 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002151 return repr;
2152}
2153
2154PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002155 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002156{
2157 return unicodeescape_string(s, size, 0);
2158}
2159
2160PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2161{
2162 if (!PyUnicode_Check(unicode)) {
2163 PyErr_BadArgument();
2164 return NULL;
2165 }
2166 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2167 PyUnicode_GET_SIZE(unicode));
2168}
2169
2170/* --- Raw Unicode Escape Codec ------------------------------------------- */
2171
2172PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002173 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174 const char *errors)
2175{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002176 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002177 Py_ssize_t startinpos;
2178 Py_ssize_t endinpos;
2179 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002180 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002181 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182 const char *end;
2183 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002184 PyObject *errorHandler = NULL;
2185 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002186
Guido van Rossumd57fd912000-03-10 22:53:23 +00002187 /* Escaped strings will always be longer than the resulting
2188 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002189 length after conversion to the true value. (But decoding error
2190 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002191 v = _PyUnicode_New(size);
2192 if (v == NULL)
2193 goto onError;
2194 if (size == 0)
2195 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002196 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002197 end = s + size;
2198 while (s < end) {
2199 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002200 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002201 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002202 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002203
2204 /* Non-escape characters are interpreted as Unicode ordinals */
2205 if (*s != '\\') {
2206 *p++ = (unsigned char)*s++;
2207 continue;
2208 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002209 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002210
2211 /* \u-escapes are only interpreted iff the number of leading
2212 backslashes if odd */
2213 bs = s;
2214 for (;s < end;) {
2215 if (*s != '\\')
2216 break;
2217 *p++ = (unsigned char)*s++;
2218 }
2219 if (((s - bs) & 1) == 0 ||
2220 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002221 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222 continue;
2223 }
2224 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002225 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002226 s++;
2227
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002228 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002229 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002230 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002231 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002233 endinpos = s-starts;
2234 if (unicode_decode_call_errorhandler(
2235 errors, &errorHandler,
2236 "rawunicodeescape", "truncated \\uXXXX",
2237 starts, size, &startinpos, &endinpos, &exc, &s,
2238 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002239 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002240 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002241 }
2242 x = (x<<4) & ~0xF;
2243 if (c >= '0' && c <= '9')
2244 x += c - '0';
2245 else if (c >= 'a' && c <= 'f')
2246 x += 10 + c - 'a';
2247 else
2248 x += 10 + c - 'A';
2249 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002250#ifndef Py_UNICODE_WIDE
2251 if (x > 0x10000) {
2252 if (unicode_decode_call_errorhandler(
2253 errors, &errorHandler,
2254 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2255 starts, size, &startinpos, &endinpos, &exc, &s,
2256 (PyObject **)&v, &outpos, &p))
2257 goto onError;
2258 }
2259#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002260 *p++ = x;
2261 nextByte:
2262 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002263 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002264 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002265 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002266 Py_XDECREF(errorHandler);
2267 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002268 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002269
Guido van Rossumd57fd912000-03-10 22:53:23 +00002270 onError:
2271 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002272 Py_XDECREF(errorHandler);
2273 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002274 return NULL;
2275}
2276
2277PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002278 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279{
2280 PyObject *repr;
2281 char *p;
2282 char *q;
2283
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002284 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002285
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002286#ifdef Py_UNICODE_WIDE
2287 repr = PyString_FromStringAndSize(NULL, 10 * size);
2288#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002289 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002290#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291 if (repr == NULL)
2292 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002293 if (size == 0)
2294 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002295
2296 p = q = PyString_AS_STRING(repr);
2297 while (size-- > 0) {
2298 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002299#ifdef Py_UNICODE_WIDE
2300 /* Map 32-bit characters to '\Uxxxxxxxx' */
2301 if (ch >= 0x10000) {
2302 *p++ = '\\';
2303 *p++ = 'U';
2304 *p++ = hexdigit[(ch >> 28) & 0xf];
2305 *p++ = hexdigit[(ch >> 24) & 0xf];
2306 *p++ = hexdigit[(ch >> 20) & 0xf];
2307 *p++ = hexdigit[(ch >> 16) & 0xf];
2308 *p++ = hexdigit[(ch >> 12) & 0xf];
2309 *p++ = hexdigit[(ch >> 8) & 0xf];
2310 *p++ = hexdigit[(ch >> 4) & 0xf];
2311 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002312 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002313 else
2314#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002315 /* Map 16-bit characters to '\uxxxx' */
2316 if (ch >= 256) {
2317 *p++ = '\\';
2318 *p++ = 'u';
2319 *p++ = hexdigit[(ch >> 12) & 0xf];
2320 *p++ = hexdigit[(ch >> 8) & 0xf];
2321 *p++ = hexdigit[(ch >> 4) & 0xf];
2322 *p++ = hexdigit[ch & 15];
2323 }
2324 /* Copy everything else as-is */
2325 else
2326 *p++ = (char) ch;
2327 }
2328 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002329 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002330 return repr;
2331}
2332
2333PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2334{
2335 if (!PyUnicode_Check(unicode)) {
2336 PyErr_BadArgument();
2337 return NULL;
2338 }
2339 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2340 PyUnicode_GET_SIZE(unicode));
2341}
2342
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002343/* --- Unicode Internal Codec ------------------------------------------- */
2344
2345PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002346 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002347 const char *errors)
2348{
2349 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002350 Py_ssize_t startinpos;
2351 Py_ssize_t endinpos;
2352 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002353 PyUnicodeObject *v;
2354 Py_UNICODE *p;
2355 const char *end;
2356 const char *reason;
2357 PyObject *errorHandler = NULL;
2358 PyObject *exc = NULL;
2359
Neal Norwitzd43069c2006-01-08 01:12:10 +00002360#ifdef Py_UNICODE_WIDE
2361 Py_UNICODE unimax = PyUnicode_GetMax();
2362#endif
2363
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002364 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2365 if (v == NULL)
2366 goto onError;
2367 if (PyUnicode_GetSize((PyObject *)v) == 0)
2368 return (PyObject *)v;
2369 p = PyUnicode_AS_UNICODE(v);
2370 end = s + size;
2371
2372 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00002373 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002374 /* We have to sanity check the raw data, otherwise doom looms for
2375 some malformed UCS-4 data. */
2376 if (
2377 #ifdef Py_UNICODE_WIDE
2378 *p > unimax || *p < 0 ||
2379 #endif
2380 end-s < Py_UNICODE_SIZE
2381 )
2382 {
2383 startinpos = s - starts;
2384 if (end-s < Py_UNICODE_SIZE) {
2385 endinpos = end-starts;
2386 reason = "truncated input";
2387 }
2388 else {
2389 endinpos = s - starts + Py_UNICODE_SIZE;
2390 reason = "illegal code point (> 0x10FFFF)";
2391 }
2392 outpos = p - PyUnicode_AS_UNICODE(v);
2393 if (unicode_decode_call_errorhandler(
2394 errors, &errorHandler,
2395 "unicode_internal", reason,
2396 starts, size, &startinpos, &endinpos, &exc, &s,
2397 (PyObject **)&v, &outpos, &p)) {
2398 goto onError;
2399 }
2400 }
2401 else {
2402 p++;
2403 s += Py_UNICODE_SIZE;
2404 }
2405 }
2406
Martin v. Löwis412fb672006-04-13 06:34:32 +00002407 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002408 goto onError;
2409 Py_XDECREF(errorHandler);
2410 Py_XDECREF(exc);
2411 return (PyObject *)v;
2412
2413 onError:
2414 Py_XDECREF(v);
2415 Py_XDECREF(errorHandler);
2416 Py_XDECREF(exc);
2417 return NULL;
2418}
2419
Guido van Rossumd57fd912000-03-10 22:53:23 +00002420/* --- Latin-1 Codec ------------------------------------------------------ */
2421
2422PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002423 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002424 const char *errors)
2425{
2426 PyUnicodeObject *v;
2427 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002428
Guido van Rossumd57fd912000-03-10 22:53:23 +00002429 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002430 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002431 Py_UNICODE r = *(unsigned char*)s;
2432 return PyUnicode_FromUnicode(&r, 1);
2433 }
2434
Guido van Rossumd57fd912000-03-10 22:53:23 +00002435 v = _PyUnicode_New(size);
2436 if (v == NULL)
2437 goto onError;
2438 if (size == 0)
2439 return (PyObject *)v;
2440 p = PyUnicode_AS_UNICODE(v);
2441 while (size-- > 0)
2442 *p++ = (unsigned char)*s++;
2443 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002444
Guido van Rossumd57fd912000-03-10 22:53:23 +00002445 onError:
2446 Py_XDECREF(v);
2447 return NULL;
2448}
2449
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002450/* create or adjust a UnicodeEncodeError */
2451static void make_encode_exception(PyObject **exceptionObject,
2452 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002453 const Py_UNICODE *unicode, Py_ssize_t size,
2454 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002455 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002456{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002457 if (*exceptionObject == NULL) {
2458 *exceptionObject = PyUnicodeEncodeError_Create(
2459 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002460 }
2461 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002462 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2463 goto onError;
2464 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2465 goto onError;
2466 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2467 goto onError;
2468 return;
2469 onError:
2470 Py_DECREF(*exceptionObject);
2471 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002472 }
2473}
2474
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002475/* raises a UnicodeEncodeError */
2476static void raise_encode_exception(PyObject **exceptionObject,
2477 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002478 const Py_UNICODE *unicode, Py_ssize_t size,
2479 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002480 const char *reason)
2481{
2482 make_encode_exception(exceptionObject,
2483 encoding, unicode, size, startpos, endpos, reason);
2484 if (*exceptionObject != NULL)
2485 PyCodec_StrictErrors(*exceptionObject);
2486}
2487
2488/* error handling callback helper:
2489 build arguments, call the callback and check the arguments,
2490 put the result into newpos and return the replacement string, which
2491 has to be freed by the caller */
2492static PyObject *unicode_encode_call_errorhandler(const char *errors,
2493 PyObject **errorHandler,
2494 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002495 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2496 Py_ssize_t startpos, Py_ssize_t endpos,
2497 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002498{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002499 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002500
2501 PyObject *restuple;
2502 PyObject *resunicode;
2503
2504 if (*errorHandler == NULL) {
2505 *errorHandler = PyCodec_LookupError(errors);
2506 if (*errorHandler == NULL)
2507 return NULL;
2508 }
2509
2510 make_encode_exception(exceptionObject,
2511 encoding, unicode, size, startpos, endpos, reason);
2512 if (*exceptionObject == NULL)
2513 return NULL;
2514
2515 restuple = PyObject_CallFunctionObjArgs(
2516 *errorHandler, *exceptionObject, NULL);
2517 if (restuple == NULL)
2518 return NULL;
2519 if (!PyTuple_Check(restuple)) {
2520 PyErr_Format(PyExc_TypeError, &argparse[4]);
2521 Py_DECREF(restuple);
2522 return NULL;
2523 }
2524 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2525 &resunicode, newpos)) {
2526 Py_DECREF(restuple);
2527 return NULL;
2528 }
2529 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002530 *newpos = size+*newpos;
2531 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002532 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002533 Py_DECREF(restuple);
2534 return NULL;
2535 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002536 Py_INCREF(resunicode);
2537 Py_DECREF(restuple);
2538 return resunicode;
2539}
2540
2541static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002542 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002543 const char *errors,
2544 int limit)
2545{
2546 /* output object */
2547 PyObject *res;
2548 /* pointers to the beginning and end+1 of input */
2549 const Py_UNICODE *startp = p;
2550 const Py_UNICODE *endp = p + size;
2551 /* pointer to the beginning of the unencodable characters */
2552 /* const Py_UNICODE *badp = NULL; */
2553 /* pointer into the output */
2554 char *str;
2555 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002556 Py_ssize_t respos = 0;
2557 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00002558 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2559 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002560 PyObject *errorHandler = NULL;
2561 PyObject *exc = NULL;
2562 /* the following variable is used for caching string comparisons
2563 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2564 int known_errorHandler = -1;
2565
2566 /* allocate enough for a simple encoding without
2567 replacements, if we need more, we'll resize */
2568 res = PyString_FromStringAndSize(NULL, size);
2569 if (res == NULL)
2570 goto onError;
2571 if (size == 0)
2572 return res;
2573 str = PyString_AS_STRING(res);
2574 ressize = size;
2575
2576 while (p<endp) {
2577 Py_UNICODE c = *p;
2578
2579 /* can we encode this? */
2580 if (c<limit) {
2581 /* no overflow check, because we know that the space is enough */
2582 *str++ = (char)c;
2583 ++p;
2584 }
2585 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002586 Py_ssize_t unicodepos = p-startp;
2587 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002588 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002589 Py_ssize_t repsize;
2590 Py_ssize_t newpos;
2591 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002592 Py_UNICODE *uni2;
2593 /* startpos for collecting unencodable chars */
2594 const Py_UNICODE *collstart = p;
2595 const Py_UNICODE *collend = p;
2596 /* find all unecodable characters */
2597 while ((collend < endp) && ((*collend)>=limit))
2598 ++collend;
2599 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2600 if (known_errorHandler==-1) {
2601 if ((errors==NULL) || (!strcmp(errors, "strict")))
2602 known_errorHandler = 1;
2603 else if (!strcmp(errors, "replace"))
2604 known_errorHandler = 2;
2605 else if (!strcmp(errors, "ignore"))
2606 known_errorHandler = 3;
2607 else if (!strcmp(errors, "xmlcharrefreplace"))
2608 known_errorHandler = 4;
2609 else
2610 known_errorHandler = 0;
2611 }
2612 switch (known_errorHandler) {
2613 case 1: /* strict */
2614 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2615 goto onError;
2616 case 2: /* replace */
2617 while (collstart++<collend)
2618 *str++ = '?'; /* fall through */
2619 case 3: /* ignore */
2620 p = collend;
2621 break;
2622 case 4: /* xmlcharrefreplace */
2623 respos = str-PyString_AS_STRING(res);
2624 /* determine replacement size (temporarily (mis)uses p) */
2625 for (p = collstart, repsize = 0; p < collend; ++p) {
2626 if (*p<10)
2627 repsize += 2+1+1;
2628 else if (*p<100)
2629 repsize += 2+2+1;
2630 else if (*p<1000)
2631 repsize += 2+3+1;
2632 else if (*p<10000)
2633 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002634#ifndef Py_UNICODE_WIDE
2635 else
2636 repsize += 2+5+1;
2637#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002638 else if (*p<100000)
2639 repsize += 2+5+1;
2640 else if (*p<1000000)
2641 repsize += 2+6+1;
2642 else
2643 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002644#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002645 }
2646 requiredsize = respos+repsize+(endp-collend);
2647 if (requiredsize > ressize) {
2648 if (requiredsize<2*ressize)
2649 requiredsize = 2*ressize;
2650 if (_PyString_Resize(&res, requiredsize))
2651 goto onError;
2652 str = PyString_AS_STRING(res) + respos;
2653 ressize = requiredsize;
2654 }
2655 /* generate replacement (temporarily (mis)uses p) */
2656 for (p = collstart; p < collend; ++p) {
2657 str += sprintf(str, "&#%d;", (int)*p);
2658 }
2659 p = collend;
2660 break;
2661 default:
2662 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2663 encoding, reason, startp, size, &exc,
2664 collstart-startp, collend-startp, &newpos);
2665 if (repunicode == NULL)
2666 goto onError;
2667 /* need more space? (at least enough for what we
2668 have+the replacement+the rest of the string, so
2669 we won't have to check space for encodable characters) */
2670 respos = str-PyString_AS_STRING(res);
2671 repsize = PyUnicode_GET_SIZE(repunicode);
2672 requiredsize = respos+repsize+(endp-collend);
2673 if (requiredsize > ressize) {
2674 if (requiredsize<2*ressize)
2675 requiredsize = 2*ressize;
2676 if (_PyString_Resize(&res, requiredsize)) {
2677 Py_DECREF(repunicode);
2678 goto onError;
2679 }
2680 str = PyString_AS_STRING(res) + respos;
2681 ressize = requiredsize;
2682 }
2683 /* check if there is anything unencodable in the replacement
2684 and copy it to the output */
2685 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2686 c = *uni2;
2687 if (c >= limit) {
2688 raise_encode_exception(&exc, encoding, startp, size,
2689 unicodepos, unicodepos+1, reason);
2690 Py_DECREF(repunicode);
2691 goto onError;
2692 }
2693 *str = (char)c;
2694 }
2695 p = startp + newpos;
2696 Py_DECREF(repunicode);
2697 }
2698 }
2699 }
2700 /* Resize if we allocated to much */
2701 respos = str-PyString_AS_STRING(res);
2702 if (respos<ressize)
2703 /* If this falls res will be NULL */
2704 _PyString_Resize(&res, respos);
2705 Py_XDECREF(errorHandler);
2706 Py_XDECREF(exc);
2707 return res;
2708
2709 onError:
2710 Py_XDECREF(res);
2711 Py_XDECREF(errorHandler);
2712 Py_XDECREF(exc);
2713 return NULL;
2714}
2715
Guido van Rossumd57fd912000-03-10 22:53:23 +00002716PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002717 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002718 const char *errors)
2719{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002720 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002721}
2722
2723PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2724{
2725 if (!PyUnicode_Check(unicode)) {
2726 PyErr_BadArgument();
2727 return NULL;
2728 }
2729 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2730 PyUnicode_GET_SIZE(unicode),
2731 NULL);
2732}
2733
2734/* --- 7-bit ASCII Codec -------------------------------------------------- */
2735
Guido van Rossumd57fd912000-03-10 22:53:23 +00002736PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002737 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738 const char *errors)
2739{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002740 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741 PyUnicodeObject *v;
2742 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002743 Py_ssize_t startinpos;
2744 Py_ssize_t endinpos;
2745 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002746 const char *e;
2747 PyObject *errorHandler = NULL;
2748 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002749
Guido van Rossumd57fd912000-03-10 22:53:23 +00002750 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002751 if (size == 1 && *(unsigned char*)s < 128) {
2752 Py_UNICODE r = *(unsigned char*)s;
2753 return PyUnicode_FromUnicode(&r, 1);
2754 }
Tim Petersced69f82003-09-16 20:30:58 +00002755
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756 v = _PyUnicode_New(size);
2757 if (v == NULL)
2758 goto onError;
2759 if (size == 0)
2760 return (PyObject *)v;
2761 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002762 e = s + size;
2763 while (s < e) {
2764 register unsigned char c = (unsigned char)*s;
2765 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002766 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002767 ++s;
2768 }
2769 else {
2770 startinpos = s-starts;
2771 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002772 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002773 if (unicode_decode_call_errorhandler(
2774 errors, &errorHandler,
2775 "ascii", "ordinal not in range(128)",
2776 starts, size, &startinpos, &endinpos, &exc, &s,
2777 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002778 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002779 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002781 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00002782 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002783 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002784 Py_XDECREF(errorHandler);
2785 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002787
Guido van Rossumd57fd912000-03-10 22:53:23 +00002788 onError:
2789 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002790 Py_XDECREF(errorHandler);
2791 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792 return NULL;
2793}
2794
Guido van Rossumd57fd912000-03-10 22:53:23 +00002795PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002796 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002797 const char *errors)
2798{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002799 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002800}
2801
2802PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2803{
2804 if (!PyUnicode_Check(unicode)) {
2805 PyErr_BadArgument();
2806 return NULL;
2807 }
2808 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2809 PyUnicode_GET_SIZE(unicode),
2810 NULL);
2811}
2812
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002813#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002814
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002815/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002816
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002817PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002818 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002819 const char *errors)
2820{
2821 PyUnicodeObject *v;
2822 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002823 DWORD usize;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002824
2825 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002826 assert(size < INT_MAX);
2827 usize = MultiByteToWideChar(CP_ACP, 0, s, (int)size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002828 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002829 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2830
2831 v = _PyUnicode_New(usize);
2832 if (v == NULL)
2833 return NULL;
2834 if (usize == 0)
2835 return (PyObject *)v;
2836 p = PyUnicode_AS_UNICODE(v);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002837 if (0 == MultiByteToWideChar(CP_ACP, 0, s, (int)size, p, usize)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002838 Py_DECREF(v);
2839 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2840 }
2841
2842 return (PyObject *)v;
2843}
2844
2845PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002846 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002847 const char *errors)
2848{
2849 PyObject *repr;
2850 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002851 DWORD mbcssize;
2852
2853 /* If there are no characters, bail now! */
2854 if (size==0)
2855 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002856
2857 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002858 assert(size<INT_MAX);
2859 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, (int)size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002860 if (mbcssize==0)
2861 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2862
2863 repr = PyString_FromStringAndSize(NULL, mbcssize);
2864 if (repr == NULL)
2865 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002866 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002867 return repr;
2868
2869 /* Do the conversion */
2870 s = PyString_AS_STRING(repr);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002871 assert(size < INT_MAX);
2872 if (0 == WideCharToMultiByte(CP_ACP, 0, p, (int)size, s, mbcssize, NULL, NULL)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002873 Py_DECREF(repr);
2874 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2875 }
2876 return repr;
2877}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002878
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002879PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2880{
2881 if (!PyUnicode_Check(unicode)) {
2882 PyErr_BadArgument();
2883 return NULL;
2884 }
2885 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2886 PyUnicode_GET_SIZE(unicode),
2887 NULL);
2888}
2889
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002890#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002891
Guido van Rossumd57fd912000-03-10 22:53:23 +00002892/* --- Character Mapping Codec -------------------------------------------- */
2893
Guido van Rossumd57fd912000-03-10 22:53:23 +00002894PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002895 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002896 PyObject *mapping,
2897 const char *errors)
2898{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002899 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002900 Py_ssize_t startinpos;
2901 Py_ssize_t endinpos;
2902 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002903 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002904 PyUnicodeObject *v;
2905 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002906 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002907 PyObject *errorHandler = NULL;
2908 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002909 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002910 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00002911
Guido van Rossumd57fd912000-03-10 22:53:23 +00002912 /* Default to Latin-1 */
2913 if (mapping == NULL)
2914 return PyUnicode_DecodeLatin1(s, size, errors);
2915
2916 v = _PyUnicode_New(size);
2917 if (v == NULL)
2918 goto onError;
2919 if (size == 0)
2920 return (PyObject *)v;
2921 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002922 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002923 if (PyUnicode_CheckExact(mapping)) {
2924 mapstring = PyUnicode_AS_UNICODE(mapping);
2925 maplen = PyUnicode_GET_SIZE(mapping);
2926 while (s < e) {
2927 unsigned char ch = *s;
2928 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002929
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002930 if (ch < maplen)
2931 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002932
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002933 if (x == 0xfffe) {
2934 /* undefined mapping */
2935 outpos = p-PyUnicode_AS_UNICODE(v);
2936 startinpos = s-starts;
2937 endinpos = startinpos+1;
2938 if (unicode_decode_call_errorhandler(
2939 errors, &errorHandler,
2940 "charmap", "character maps to <undefined>",
2941 starts, size, &startinpos, &endinpos, &exc, &s,
2942 (PyObject **)&v, &outpos, &p)) {
2943 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002944 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002945 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002946 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002947 *p++ = x;
2948 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002949 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002950 }
2951 else {
2952 while (s < e) {
2953 unsigned char ch = *s;
2954 PyObject *w, *x;
2955
2956 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2957 w = PyInt_FromLong((long)ch);
2958 if (w == NULL)
2959 goto onError;
2960 x = PyObject_GetItem(mapping, w);
2961 Py_DECREF(w);
2962 if (x == NULL) {
2963 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2964 /* No mapping found means: mapping is undefined. */
2965 PyErr_Clear();
2966 x = Py_None;
2967 Py_INCREF(x);
2968 } else
2969 goto onError;
2970 }
2971
2972 /* Apply mapping */
2973 if (PyInt_Check(x)) {
2974 long value = PyInt_AS_LONG(x);
2975 if (value < 0 || value > 65535) {
2976 PyErr_SetString(PyExc_TypeError,
2977 "character mapping must be in range(65536)");
2978 Py_DECREF(x);
2979 goto onError;
2980 }
2981 *p++ = (Py_UNICODE)value;
2982 }
2983 else if (x == Py_None) {
2984 /* undefined mapping */
2985 outpos = p-PyUnicode_AS_UNICODE(v);
2986 startinpos = s-starts;
2987 endinpos = startinpos+1;
2988 if (unicode_decode_call_errorhandler(
2989 errors, &errorHandler,
2990 "charmap", "character maps to <undefined>",
2991 starts, size, &startinpos, &endinpos, &exc, &s,
2992 (PyObject **)&v, &outpos, &p)) {
2993 Py_DECREF(x);
2994 goto onError;
2995 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00002996 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002997 continue;
2998 }
2999 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003000 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003001
3002 if (targetsize == 1)
3003 /* 1-1 mapping */
3004 *p++ = *PyUnicode_AS_UNICODE(x);
3005
3006 else if (targetsize > 1) {
3007 /* 1-n mapping */
3008 if (targetsize > extrachars) {
3009 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003010 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3011 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003012 (targetsize << 2);
3013 extrachars += needed;
3014 if (_PyUnicode_Resize(&v,
3015 PyUnicode_GET_SIZE(v) + needed) < 0) {
3016 Py_DECREF(x);
3017 goto onError;
3018 }
3019 p = PyUnicode_AS_UNICODE(v) + oldpos;
3020 }
3021 Py_UNICODE_COPY(p,
3022 PyUnicode_AS_UNICODE(x),
3023 targetsize);
3024 p += targetsize;
3025 extrachars -= targetsize;
3026 }
3027 /* 1-0 mapping: skip the character */
3028 }
3029 else {
3030 /* wrong return value */
3031 PyErr_SetString(PyExc_TypeError,
3032 "character mapping must return integer, None or unicode");
3033 Py_DECREF(x);
3034 goto onError;
3035 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003037 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039 }
3040 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003041 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003042 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003043 Py_XDECREF(errorHandler);
3044 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003045 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003046
Guido van Rossumd57fd912000-03-10 22:53:23 +00003047 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003048 Py_XDECREF(errorHandler);
3049 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050 Py_XDECREF(v);
3051 return NULL;
3052}
3053
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003054/* Lookup the character ch in the mapping. If the character
3055 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003056 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003057static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003058{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003059 PyObject *w = PyInt_FromLong((long)c);
3060 PyObject *x;
3061
3062 if (w == NULL)
3063 return NULL;
3064 x = PyObject_GetItem(mapping, w);
3065 Py_DECREF(w);
3066 if (x == NULL) {
3067 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3068 /* No mapping found means: mapping is undefined. */
3069 PyErr_Clear();
3070 x = Py_None;
3071 Py_INCREF(x);
3072 return x;
3073 } else
3074 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003075 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003076 else if (x == Py_None)
3077 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003078 else if (PyInt_Check(x)) {
3079 long value = PyInt_AS_LONG(x);
3080 if (value < 0 || value > 255) {
3081 PyErr_SetString(PyExc_TypeError,
3082 "character mapping must be in range(256)");
3083 Py_DECREF(x);
3084 return NULL;
3085 }
3086 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003088 else if (PyString_Check(x))
3089 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003091 /* wrong return value */
3092 PyErr_SetString(PyExc_TypeError,
3093 "character mapping must return integer, None or str");
3094 Py_DECREF(x);
3095 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096 }
3097}
3098
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003099/* lookup the character, put the result in the output string and adjust
3100 various state variables. Reallocate the output string if not enough
3101 space is available. Return a new reference to the object that
3102 was put in the output buffer, or Py_None, if the mapping was undefined
3103 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003104 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003105static
3106PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003107 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003108{
3109 PyObject *rep = charmapencode_lookup(c, mapping);
3110
3111 if (rep==NULL)
3112 return NULL;
3113 else if (rep==Py_None)
3114 return rep;
3115 else {
3116 char *outstart = PyString_AS_STRING(*outobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003117 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003118 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003119 Py_ssize_t requiredsize = *outpos+1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003120 if (outsize<requiredsize) {
3121 /* exponentially overallocate to minimize reallocations */
3122 if (requiredsize < 2*outsize)
3123 requiredsize = 2*outsize;
3124 if (_PyString_Resize(outobj, requiredsize)) {
3125 Py_DECREF(rep);
3126 return NULL;
3127 }
3128 outstart = PyString_AS_STRING(*outobj);
3129 }
3130 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3131 }
3132 else {
3133 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003134 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3135 Py_ssize_t requiredsize = *outpos+repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003136 if (outsize<requiredsize) {
3137 /* exponentially overallocate to minimize reallocations */
3138 if (requiredsize < 2*outsize)
3139 requiredsize = 2*outsize;
3140 if (_PyString_Resize(outobj, requiredsize)) {
3141 Py_DECREF(rep);
3142 return NULL;
3143 }
3144 outstart = PyString_AS_STRING(*outobj);
3145 }
3146 memcpy(outstart + *outpos, repchars, repsize);
3147 *outpos += repsize;
3148 }
3149 }
3150 return rep;
3151}
3152
3153/* handle an error in PyUnicode_EncodeCharmap
3154 Return 0 on success, -1 on error */
3155static
3156int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003157 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003158 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003159 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003160 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003161{
3162 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003163 Py_ssize_t repsize;
3164 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003165 Py_UNICODE *uni2;
3166 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003167 Py_ssize_t collstartpos = *inpos;
3168 Py_ssize_t collendpos = *inpos+1;
3169 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003170 char *encoding = "charmap";
3171 char *reason = "character maps to <undefined>";
3172
3173 PyObject *x;
3174 /* find all unencodable characters */
3175 while (collendpos < size) {
3176 x = charmapencode_lookup(p[collendpos], mapping);
3177 if (x==NULL)
3178 return -1;
3179 else if (x!=Py_None) {
3180 Py_DECREF(x);
3181 break;
3182 }
3183 Py_DECREF(x);
3184 ++collendpos;
3185 }
3186 /* cache callback name lookup
3187 * (if not done yet, i.e. it's the first error) */
3188 if (*known_errorHandler==-1) {
3189 if ((errors==NULL) || (!strcmp(errors, "strict")))
3190 *known_errorHandler = 1;
3191 else if (!strcmp(errors, "replace"))
3192 *known_errorHandler = 2;
3193 else if (!strcmp(errors, "ignore"))
3194 *known_errorHandler = 3;
3195 else if (!strcmp(errors, "xmlcharrefreplace"))
3196 *known_errorHandler = 4;
3197 else
3198 *known_errorHandler = 0;
3199 }
3200 switch (*known_errorHandler) {
3201 case 1: /* strict */
3202 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3203 return -1;
3204 case 2: /* replace */
3205 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3206 x = charmapencode_output('?', mapping, res, respos);
3207 if (x==NULL) {
3208 return -1;
3209 }
3210 else if (x==Py_None) {
3211 Py_DECREF(x);
3212 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3213 return -1;
3214 }
3215 Py_DECREF(x);
3216 }
3217 /* fall through */
3218 case 3: /* ignore */
3219 *inpos = collendpos;
3220 break;
3221 case 4: /* xmlcharrefreplace */
3222 /* generate replacement (temporarily (mis)uses p) */
3223 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3224 char buffer[2+29+1+1];
3225 char *cp;
3226 sprintf(buffer, "&#%d;", (int)p[collpos]);
3227 for (cp = buffer; *cp; ++cp) {
3228 x = charmapencode_output(*cp, mapping, res, respos);
3229 if (x==NULL)
3230 return -1;
3231 else if (x==Py_None) {
3232 Py_DECREF(x);
3233 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3234 return -1;
3235 }
3236 Py_DECREF(x);
3237 }
3238 }
3239 *inpos = collendpos;
3240 break;
3241 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003242 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003243 encoding, reason, p, size, exceptionObject,
3244 collstartpos, collendpos, &newpos);
3245 if (repunicode == NULL)
3246 return -1;
3247 /* generate replacement */
3248 repsize = PyUnicode_GET_SIZE(repunicode);
3249 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3250 x = charmapencode_output(*uni2, mapping, res, respos);
3251 if (x==NULL) {
3252 Py_DECREF(repunicode);
3253 return -1;
3254 }
3255 else if (x==Py_None) {
3256 Py_DECREF(repunicode);
3257 Py_DECREF(x);
3258 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3259 return -1;
3260 }
3261 Py_DECREF(x);
3262 }
3263 *inpos = newpos;
3264 Py_DECREF(repunicode);
3265 }
3266 return 0;
3267}
3268
Guido van Rossumd57fd912000-03-10 22:53:23 +00003269PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003270 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003271 PyObject *mapping,
3272 const char *errors)
3273{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003274 /* output object */
3275 PyObject *res = NULL;
3276 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003277 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003278 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003279 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003280 PyObject *errorHandler = NULL;
3281 PyObject *exc = NULL;
3282 /* the following variable is used for caching string comparisons
3283 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3284 * 3=ignore, 4=xmlcharrefreplace */
3285 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003286
3287 /* Default to Latin-1 */
3288 if (mapping == NULL)
3289 return PyUnicode_EncodeLatin1(p, size, errors);
3290
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003291 /* allocate enough for a simple encoding without
3292 replacements, if we need more, we'll resize */
3293 res = PyString_FromStringAndSize(NULL, size);
3294 if (res == NULL)
3295 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003296 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003297 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003299 while (inpos<size) {
3300 /* try to encode it */
3301 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3302 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003304 if (x==Py_None) { /* unencodable character */
3305 if (charmap_encoding_error(p, size, &inpos, mapping,
3306 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003307 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003308 &res, &respos)) {
3309 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003310 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003311 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003312 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003313 else
3314 /* done with this character => adjust input position */
3315 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003316 Py_DECREF(x);
3317 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003318
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003319 /* Resize if we allocated to much */
3320 if (respos<PyString_GET_SIZE(res)) {
3321 if (_PyString_Resize(&res, respos))
3322 goto onError;
3323 }
3324 Py_XDECREF(exc);
3325 Py_XDECREF(errorHandler);
3326 return res;
3327
3328 onError:
3329 Py_XDECREF(res);
3330 Py_XDECREF(exc);
3331 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003332 return NULL;
3333}
3334
3335PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3336 PyObject *mapping)
3337{
3338 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3339 PyErr_BadArgument();
3340 return NULL;
3341 }
3342 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3343 PyUnicode_GET_SIZE(unicode),
3344 mapping,
3345 NULL);
3346}
3347
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003348/* create or adjust a UnicodeTranslateError */
3349static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003350 const Py_UNICODE *unicode, Py_ssize_t size,
3351 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003352 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003353{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003354 if (*exceptionObject == NULL) {
3355 *exceptionObject = PyUnicodeTranslateError_Create(
3356 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003357 }
3358 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003359 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3360 goto onError;
3361 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3362 goto onError;
3363 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3364 goto onError;
3365 return;
3366 onError:
3367 Py_DECREF(*exceptionObject);
3368 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003369 }
3370}
3371
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003372/* raises a UnicodeTranslateError */
3373static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003374 const Py_UNICODE *unicode, Py_ssize_t size,
3375 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003376 const char *reason)
3377{
3378 make_translate_exception(exceptionObject,
3379 unicode, size, startpos, endpos, reason);
3380 if (*exceptionObject != NULL)
3381 PyCodec_StrictErrors(*exceptionObject);
3382}
3383
3384/* error handling callback helper:
3385 build arguments, call the callback and check the arguments,
3386 put the result into newpos and return the replacement string, which
3387 has to be freed by the caller */
3388static PyObject *unicode_translate_call_errorhandler(const char *errors,
3389 PyObject **errorHandler,
3390 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003391 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3392 Py_ssize_t startpos, Py_ssize_t endpos,
3393 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003394{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003395 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003396
Martin v. Löwis412fb672006-04-13 06:34:32 +00003397 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003398 PyObject *restuple;
3399 PyObject *resunicode;
3400
3401 if (*errorHandler == NULL) {
3402 *errorHandler = PyCodec_LookupError(errors);
3403 if (*errorHandler == NULL)
3404 return NULL;
3405 }
3406
3407 make_translate_exception(exceptionObject,
3408 unicode, size, startpos, endpos, reason);
3409 if (*exceptionObject == NULL)
3410 return NULL;
3411
3412 restuple = PyObject_CallFunctionObjArgs(
3413 *errorHandler, *exceptionObject, NULL);
3414 if (restuple == NULL)
3415 return NULL;
3416 if (!PyTuple_Check(restuple)) {
3417 PyErr_Format(PyExc_TypeError, &argparse[4]);
3418 Py_DECREF(restuple);
3419 return NULL;
3420 }
3421 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003422 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003423 Py_DECREF(restuple);
3424 return NULL;
3425 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003426 if (i_newpos<0)
3427 *newpos = size+i_newpos;
3428 else
3429 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003430 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003431 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003432 Py_DECREF(restuple);
3433 return NULL;
3434 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003435 Py_INCREF(resunicode);
3436 Py_DECREF(restuple);
3437 return resunicode;
3438}
3439
3440/* Lookup the character ch in the mapping and put the result in result,
3441 which must be decrefed by the caller.
3442 Return 0 on success, -1 on error */
3443static
3444int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3445{
3446 PyObject *w = PyInt_FromLong((long)c);
3447 PyObject *x;
3448
3449 if (w == NULL)
3450 return -1;
3451 x = PyObject_GetItem(mapping, w);
3452 Py_DECREF(w);
3453 if (x == NULL) {
3454 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3455 /* No mapping found means: use 1:1 mapping. */
3456 PyErr_Clear();
3457 *result = NULL;
3458 return 0;
3459 } else
3460 return -1;
3461 }
3462 else if (x == Py_None) {
3463 *result = x;
3464 return 0;
3465 }
3466 else if (PyInt_Check(x)) {
3467 long value = PyInt_AS_LONG(x);
3468 long max = PyUnicode_GetMax();
3469 if (value < 0 || value > max) {
3470 PyErr_Format(PyExc_TypeError,
3471 "character mapping must be in range(0x%lx)", max+1);
3472 Py_DECREF(x);
3473 return -1;
3474 }
3475 *result = x;
3476 return 0;
3477 }
3478 else if (PyUnicode_Check(x)) {
3479 *result = x;
3480 return 0;
3481 }
3482 else {
3483 /* wrong return value */
3484 PyErr_SetString(PyExc_TypeError,
3485 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003486 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003487 return -1;
3488 }
3489}
3490/* ensure that *outobj is at least requiredsize characters long,
3491if not reallocate and adjust various state variables.
3492Return 0 on success, -1 on error */
3493static
Walter Dörwald4894c302003-10-24 14:25:28 +00003494int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003495 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003496{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003497 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003498 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003499 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003500 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003501 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003502 if (requiredsize < 2 * oldsize)
3503 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003504 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003505 return -1;
3506 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003507 }
3508 return 0;
3509}
3510/* lookup the character, put the result in the output string and adjust
3511 various state variables. Return a new reference to the object that
3512 was put in the output buffer in *result, or Py_None, if the mapping was
3513 undefined (in which case no character was written).
3514 The called must decref result.
3515 Return 0 on success, -1 on error. */
3516static
Walter Dörwald4894c302003-10-24 14:25:28 +00003517int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003518 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003519 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003520{
Walter Dörwald4894c302003-10-24 14:25:28 +00003521 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003522 return -1;
3523 if (*res==NULL) {
3524 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003525 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003526 }
3527 else if (*res==Py_None)
3528 ;
3529 else if (PyInt_Check(*res)) {
3530 /* no overflow check, because we know that the space is enough */
3531 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3532 }
3533 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003534 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003535 if (repsize==1) {
3536 /* no overflow check, because we know that the space is enough */
3537 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3538 }
3539 else if (repsize!=0) {
3540 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003541 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003542 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003543 repsize - 1;
3544 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003545 return -1;
3546 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3547 *outp += repsize;
3548 }
3549 }
3550 else
3551 return -1;
3552 return 0;
3553}
3554
3555PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003556 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003557 PyObject *mapping,
3558 const char *errors)
3559{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003560 /* output object */
3561 PyObject *res = NULL;
3562 /* pointers to the beginning and end+1 of input */
3563 const Py_UNICODE *startp = p;
3564 const Py_UNICODE *endp = p + size;
3565 /* pointer into the output */
3566 Py_UNICODE *str;
3567 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003568 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003569 char *reason = "character maps to <undefined>";
3570 PyObject *errorHandler = NULL;
3571 PyObject *exc = NULL;
3572 /* the following variable is used for caching string comparisons
3573 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3574 * 3=ignore, 4=xmlcharrefreplace */
3575 int known_errorHandler = -1;
3576
Guido van Rossumd57fd912000-03-10 22:53:23 +00003577 if (mapping == NULL) {
3578 PyErr_BadArgument();
3579 return NULL;
3580 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003581
3582 /* allocate enough for a simple 1:1 translation without
3583 replacements, if we need more, we'll resize */
3584 res = PyUnicode_FromUnicode(NULL, size);
3585 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003586 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003587 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003588 return res;
3589 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003590
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003591 while (p<endp) {
3592 /* try to encode it */
3593 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003594 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003595 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003596 goto onError;
3597 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003598 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003599 if (x!=Py_None) /* it worked => adjust input pointer */
3600 ++p;
3601 else { /* untranslatable character */
3602 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003603 Py_ssize_t repsize;
3604 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003605 Py_UNICODE *uni2;
3606 /* startpos for collecting untranslatable chars */
3607 const Py_UNICODE *collstart = p;
3608 const Py_UNICODE *collend = p+1;
3609 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003610
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003611 /* find all untranslatable characters */
3612 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003613 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003614 goto onError;
3615 Py_XDECREF(x);
3616 if (x!=Py_None)
3617 break;
3618 ++collend;
3619 }
3620 /* cache callback name lookup
3621 * (if not done yet, i.e. it's the first error) */
3622 if (known_errorHandler==-1) {
3623 if ((errors==NULL) || (!strcmp(errors, "strict")))
3624 known_errorHandler = 1;
3625 else if (!strcmp(errors, "replace"))
3626 known_errorHandler = 2;
3627 else if (!strcmp(errors, "ignore"))
3628 known_errorHandler = 3;
3629 else if (!strcmp(errors, "xmlcharrefreplace"))
3630 known_errorHandler = 4;
3631 else
3632 known_errorHandler = 0;
3633 }
3634 switch (known_errorHandler) {
3635 case 1: /* strict */
3636 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3637 goto onError;
3638 case 2: /* replace */
3639 /* No need to check for space, this is a 1:1 replacement */
3640 for (coll = collstart; coll<collend; ++coll)
3641 *str++ = '?';
3642 /* fall through */
3643 case 3: /* ignore */
3644 p = collend;
3645 break;
3646 case 4: /* xmlcharrefreplace */
3647 /* generate replacement (temporarily (mis)uses p) */
3648 for (p = collstart; p < collend; ++p) {
3649 char buffer[2+29+1+1];
3650 char *cp;
3651 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003652 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003653 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3654 goto onError;
3655 for (cp = buffer; *cp; ++cp)
3656 *str++ = *cp;
3657 }
3658 p = collend;
3659 break;
3660 default:
3661 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3662 reason, startp, size, &exc,
3663 collstart-startp, collend-startp, &newpos);
3664 if (repunicode == NULL)
3665 goto onError;
3666 /* generate replacement */
3667 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003668 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003669 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3670 Py_DECREF(repunicode);
3671 goto onError;
3672 }
3673 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3674 *str++ = *uni2;
3675 p = startp + newpos;
3676 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003677 }
3678 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003679 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003680 /* Resize if we allocated to much */
3681 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003682 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003683 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003684 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003685 }
3686 Py_XDECREF(exc);
3687 Py_XDECREF(errorHandler);
3688 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003689
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003690 onError:
3691 Py_XDECREF(res);
3692 Py_XDECREF(exc);
3693 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003694 return NULL;
3695}
3696
3697PyObject *PyUnicode_Translate(PyObject *str,
3698 PyObject *mapping,
3699 const char *errors)
3700{
3701 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003702
Guido van Rossumd57fd912000-03-10 22:53:23 +00003703 str = PyUnicode_FromObject(str);
3704 if (str == NULL)
3705 goto onError;
3706 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3707 PyUnicode_GET_SIZE(str),
3708 mapping,
3709 errors);
3710 Py_DECREF(str);
3711 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003712
Guido van Rossumd57fd912000-03-10 22:53:23 +00003713 onError:
3714 Py_XDECREF(str);
3715 return NULL;
3716}
Tim Petersced69f82003-09-16 20:30:58 +00003717
Guido van Rossum9e896b32000-04-05 20:11:21 +00003718/* --- Decimal Encoder ---------------------------------------------------- */
3719
3720int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003721 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00003722 char *output,
3723 const char *errors)
3724{
3725 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003726 PyObject *errorHandler = NULL;
3727 PyObject *exc = NULL;
3728 const char *encoding = "decimal";
3729 const char *reason = "invalid decimal Unicode string";
3730 /* the following variable is used for caching string comparisons
3731 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3732 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003733
3734 if (output == NULL) {
3735 PyErr_BadArgument();
3736 return -1;
3737 }
3738
3739 p = s;
3740 end = s + length;
3741 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003742 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003743 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003744 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003745 Py_ssize_t repsize;
3746 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003747 Py_UNICODE *uni2;
3748 Py_UNICODE *collstart;
3749 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003750
Guido van Rossum9e896b32000-04-05 20:11:21 +00003751 if (Py_UNICODE_ISSPACE(ch)) {
3752 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003753 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003754 continue;
3755 }
3756 decimal = Py_UNICODE_TODECIMAL(ch);
3757 if (decimal >= 0) {
3758 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003759 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003760 continue;
3761 }
Guido van Rossumba477042000-04-06 18:18:10 +00003762 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003763 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003764 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003765 continue;
3766 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003767 /* All other characters are considered unencodable */
3768 collstart = p;
3769 collend = p+1;
3770 while (collend < end) {
3771 if ((0 < *collend && *collend < 256) ||
3772 !Py_UNICODE_ISSPACE(*collend) ||
3773 Py_UNICODE_TODECIMAL(*collend))
3774 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003775 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003776 /* cache callback name lookup
3777 * (if not done yet, i.e. it's the first error) */
3778 if (known_errorHandler==-1) {
3779 if ((errors==NULL) || (!strcmp(errors, "strict")))
3780 known_errorHandler = 1;
3781 else if (!strcmp(errors, "replace"))
3782 known_errorHandler = 2;
3783 else if (!strcmp(errors, "ignore"))
3784 known_errorHandler = 3;
3785 else if (!strcmp(errors, "xmlcharrefreplace"))
3786 known_errorHandler = 4;
3787 else
3788 known_errorHandler = 0;
3789 }
3790 switch (known_errorHandler) {
3791 case 1: /* strict */
3792 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3793 goto onError;
3794 case 2: /* replace */
3795 for (p = collstart; p < collend; ++p)
3796 *output++ = '?';
3797 /* fall through */
3798 case 3: /* ignore */
3799 p = collend;
3800 break;
3801 case 4: /* xmlcharrefreplace */
3802 /* generate replacement (temporarily (mis)uses p) */
3803 for (p = collstart; p < collend; ++p)
3804 output += sprintf(output, "&#%d;", (int)*p);
3805 p = collend;
3806 break;
3807 default:
3808 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3809 encoding, reason, s, length, &exc,
3810 collstart-s, collend-s, &newpos);
3811 if (repunicode == NULL)
3812 goto onError;
3813 /* generate replacement */
3814 repsize = PyUnicode_GET_SIZE(repunicode);
3815 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3816 Py_UNICODE ch = *uni2;
3817 if (Py_UNICODE_ISSPACE(ch))
3818 *output++ = ' ';
3819 else {
3820 decimal = Py_UNICODE_TODECIMAL(ch);
3821 if (decimal >= 0)
3822 *output++ = '0' + decimal;
3823 else if (0 < ch && ch < 256)
3824 *output++ = (char)ch;
3825 else {
3826 Py_DECREF(repunicode);
3827 raise_encode_exception(&exc, encoding,
3828 s, length, collstart-s, collend-s, reason);
3829 goto onError;
3830 }
3831 }
3832 }
3833 p = s + newpos;
3834 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003835 }
3836 }
3837 /* 0-terminate the output string */
3838 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003839 Py_XDECREF(exc);
3840 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003841 return 0;
3842
3843 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003844 Py_XDECREF(exc);
3845 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003846 return -1;
3847}
3848
Guido van Rossumd57fd912000-03-10 22:53:23 +00003849/* --- Helpers ------------------------------------------------------------ */
3850
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003851#define USE_FAST /* experimental fast search implementation */
3852
3853/* fast search/count implementation, based on a mix between boyer-
3854 moore and horspool, with a few more bells and whistles on the top.
3855 for some more background, see: http://effbot.org/stringlib */
3856
/* note: fastsearch may access s[n], which isn't a problem when using
   Python's ordinary string types.  also, the count mode returns -1 if
   there cannot possibly be a match in the target string (the pattern is
   empty or longer than the target), and the match count (which may be 0)
   if it has actually checked for matches. */
3861
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003862#define FAST_COUNT 0
3863#define FAST_SEARCH 1
3864
Fredrik Lundhd5e0dc52006-05-24 15:11:01 +00003865LOCAL(Py_ssize_t)
3866fastsearch(Py_UNICODE* s, Py_ssize_t n, Py_UNICODE* p, Py_ssize_t m, int mode)
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003867{
3868 long mask;
3869 int skip, count = 0;
3870 Py_ssize_t i, j, mlast, w;
3871
3872 w = n - m;
3873
3874 if (w < 0)
3875 return -1;
3876
3877 /* look for special cases */
3878 if (m <= 1) {
Fredrik Lundhd5e0dc52006-05-24 15:11:01 +00003879 if (m <= 0)
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003880 return -1;
3881 /* use special case for 1-character strings */
3882 if (mode == FAST_COUNT) {
3883 for (i = 0; i < n; i++)
3884 if (s[i] == p[0])
3885 count++;
3886 return count;
3887 } else {
3888 for (i = 0; i < n; i++)
3889 if (s[i] == p[0])
3890 return i;
3891 }
3892 return -1;
3893 }
3894
3895 mlast = m - 1;
3896
3897 /* create compressed boyer-moore delta 1 table */
3898 skip = mlast - 1;
3899 /* process pattern[:-1] */
3900 for (mask = i = 0; i < mlast; i++) {
3901 mask |= (1 << (p[i] & 0x1F));
3902 if (p[i] == p[mlast])
3903 skip = mlast - i - 1;
3904 }
3905 /* process pattern[-1] outside the loop */
3906 mask |= (1 << (p[mlast] & 0x1F));
3907
3908 for (i = 0; i <= w; i++) {
3909 /* note: using mlast in the skip path slows things down on x86 */
3910 if (s[i+m-1] == p[m-1]) {
3911 /* candidate match */
3912 for (j = 0; j < mlast; j++)
3913 if (s[i+j] != p[j])
3914 break;
3915 if (j == mlast) {
3916 /* got a match! */
3917 if (mode != FAST_COUNT)
3918 return i;
3919 count++;
3920 i = i + mlast;
3921 continue;
3922 }
3923 /* miss: check if next character is part of pattern */
3924 if (!(mask & (1 << (s[i+m] & 0x1F))))
3925 i = i + m;
3926 else {
3927 i = i + skip;
3928 continue;
3929 }
3930 } else {
3931 /* skip: check if next character is part of pattern */
3932 if (!(mask & (1 << (s[i+m] & 0x1F))))
3933 i = i + m;
3934 }
3935 }
3936
3937 if (mode != FAST_COUNT)
3938 return -1;
3939 return count;
3940}
3941
3942LOCAL(Py_ssize_t) count(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003943 Py_ssize_t start,
3944 Py_ssize_t end,
3945 PyUnicodeObject *substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003946{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003947 Py_ssize_t count = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003948
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003949 if (start < 0)
3950 start += self->length;
3951 if (start < 0)
3952 start = 0;
3953 if (end > self->length)
3954 end = self->length;
3955 if (end < 0)
3956 end += self->length;
3957 if (end < 0)
3958 end = 0;
3959
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003960 if (substring->length == 0)
3961 return (end - start + 1);
3962
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003963#ifdef USE_FAST
3964 count = fastsearch(
3965 PyUnicode_AS_UNICODE(self) + start, end - start,
3966 substring->str, substring->length, FAST_COUNT
3967 );
3968 if (count < 0)
3969 count = 0; /* no match */
3970#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003971 end -= substring->length;
3972
3973 while (start <= end)
3974 if (Py_UNICODE_MATCH(self, start, substring)) {
3975 count++;
3976 start += substring->length;
3977 } else
3978 start++;
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003979#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003980
3981 return count;
3982}
3983
Martin v. Löwis18e16552006-02-15 17:27:45 +00003984Py_ssize_t PyUnicode_Count(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003985 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003986 Py_ssize_t start,
3987 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003988{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003989 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00003990
Guido van Rossumd57fd912000-03-10 22:53:23 +00003991 str = PyUnicode_FromObject(str);
3992 if (str == NULL)
3993 return -1;
3994 substr = PyUnicode_FromObject(substr);
3995 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003996 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003997 return -1;
3998 }
Tim Petersced69f82003-09-16 20:30:58 +00003999
Guido van Rossumd57fd912000-03-10 22:53:23 +00004000 result = count((PyUnicodeObject *)str,
4001 start, end,
4002 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00004003
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004 Py_DECREF(str);
4005 Py_DECREF(substr);
4006 return result;
4007}
4008
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004009static Py_ssize_t findstring(PyUnicodeObject *self,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004010 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004011 Py_ssize_t start,
4012 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004013 int direction)
4014{
4015 if (start < 0)
4016 start += self->length;
4017 if (start < 0)
4018 start = 0;
4019
Guido van Rossumd57fd912000-03-10 22:53:23 +00004020 if (end > self->length)
4021 end = self->length;
4022 if (end < 0)
4023 end += self->length;
4024 if (end < 0)
4025 end = 0;
4026
Guido van Rossum76afbd92002-08-20 17:29:29 +00004027 if (substring->length == 0)
4028 return (direction > 0) ? start : end;
4029
Fredrik Lundh6471ee42006-05-24 14:28:11 +00004030#ifdef USE_FAST
4031 if (direction > 0) {
4032 Py_ssize_t pos = fastsearch(
4033 PyUnicode_AS_UNICODE(self) + start, end - start,
4034 substring->str, substring->length, FAST_SEARCH
4035 );
4036 if (pos < 0)
4037 return pos;
4038 return pos + start;
4039 }
4040#endif
4041
Guido van Rossumd57fd912000-03-10 22:53:23 +00004042 end -= substring->length;
4043
4044 if (direction < 0) {
4045 for (; end >= start; end--)
4046 if (Py_UNICODE_MATCH(self, end, substring))
4047 return end;
4048 } else {
4049 for (; start <= end; start++)
4050 if (Py_UNICODE_MATCH(self, start, substring))
4051 return start;
4052 }
4053
4054 return -1;
4055}
4056
Martin v. Löwis18e16552006-02-15 17:27:45 +00004057Py_ssize_t PyUnicode_Find(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004058 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004059 Py_ssize_t start,
4060 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004061 int direction)
4062{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004063 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004064
Guido van Rossumd57fd912000-03-10 22:53:23 +00004065 str = PyUnicode_FromObject(str);
4066 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004067 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004068 substr = PyUnicode_FromObject(substr);
4069 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004070 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004071 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004072 }
Tim Petersced69f82003-09-16 20:30:58 +00004073
Guido van Rossumd57fd912000-03-10 22:53:23 +00004074 result = findstring((PyUnicodeObject *)str,
4075 (PyUnicodeObject *)substr,
4076 start, end, direction);
4077 Py_DECREF(str);
4078 Py_DECREF(substr);
4079 return result;
4080}
4081
Tim Petersced69f82003-09-16 20:30:58 +00004082static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004083int tailmatch(PyUnicodeObject *self,
4084 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004085 Py_ssize_t start,
4086 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004087 int direction)
4088{
4089 if (start < 0)
4090 start += self->length;
4091 if (start < 0)
4092 start = 0;
4093
4094 if (substring->length == 0)
4095 return 1;
4096
4097 if (end > self->length)
4098 end = self->length;
4099 if (end < 0)
4100 end += self->length;
4101 if (end < 0)
4102 end = 0;
4103
4104 end -= substring->length;
4105 if (end < start)
4106 return 0;
4107
4108 if (direction > 0) {
4109 if (Py_UNICODE_MATCH(self, end, substring))
4110 return 1;
4111 } else {
4112 if (Py_UNICODE_MATCH(self, start, substring))
4113 return 1;
4114 }
4115
4116 return 0;
4117}
4118
Martin v. Löwis18e16552006-02-15 17:27:45 +00004119Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004120 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004121 Py_ssize_t start,
4122 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004123 int direction)
4124{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004125 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004126
Guido van Rossumd57fd912000-03-10 22:53:23 +00004127 str = PyUnicode_FromObject(str);
4128 if (str == NULL)
4129 return -1;
4130 substr = PyUnicode_FromObject(substr);
4131 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004132 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004133 return -1;
4134 }
Tim Petersced69f82003-09-16 20:30:58 +00004135
Guido van Rossumd57fd912000-03-10 22:53:23 +00004136 result = tailmatch((PyUnicodeObject *)str,
4137 (PyUnicodeObject *)substr,
4138 start, end, direction);
4139 Py_DECREF(str);
4140 Py_DECREF(substr);
4141 return result;
4142}
4143
Tim Petersced69f82003-09-16 20:30:58 +00004144static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004145const Py_UNICODE *findchar(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004146 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147 Py_UNICODE ch)
4148{
4149 /* like wcschr, but doesn't stop at NULL characters */
4150
4151 while (size-- > 0) {
4152 if (*s == ch)
4153 return s;
4154 s++;
4155 }
4156
4157 return NULL;
4158}
4159
4160/* Apply fixfct filter to the Unicode object self and return a
4161 reference to the modified object */
4162
Tim Petersced69f82003-09-16 20:30:58 +00004163static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004164PyObject *fixup(PyUnicodeObject *self,
4165 int (*fixfct)(PyUnicodeObject *s))
4166{
4167
4168 PyUnicodeObject *u;
4169
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004170 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004171 if (u == NULL)
4172 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004173
4174 Py_UNICODE_COPY(u->str, self->str, self->length);
4175
Tim Peters7a29bd52001-09-12 03:03:31 +00004176 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004177 /* fixfct should return TRUE if it modified the buffer. If
4178 FALSE, return a reference to the original buffer instead
4179 (to save space, not time) */
4180 Py_INCREF(self);
4181 Py_DECREF(u);
4182 return (PyObject*) self;
4183 }
4184 return (PyObject*) u;
4185}
4186
Tim Petersced69f82003-09-16 20:30:58 +00004187static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004188int fixupper(PyUnicodeObject *self)
4189{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004190 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004191 Py_UNICODE *s = self->str;
4192 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004193
Guido van Rossumd57fd912000-03-10 22:53:23 +00004194 while (len-- > 0) {
4195 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004196
Guido van Rossumd57fd912000-03-10 22:53:23 +00004197 ch = Py_UNICODE_TOUPPER(*s);
4198 if (ch != *s) {
4199 status = 1;
4200 *s = ch;
4201 }
4202 s++;
4203 }
4204
4205 return status;
4206}
4207
Tim Petersced69f82003-09-16 20:30:58 +00004208static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004209int fixlower(PyUnicodeObject *self)
4210{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004211 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004212 Py_UNICODE *s = self->str;
4213 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004214
Guido van Rossumd57fd912000-03-10 22:53:23 +00004215 while (len-- > 0) {
4216 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004217
Guido van Rossumd57fd912000-03-10 22:53:23 +00004218 ch = Py_UNICODE_TOLOWER(*s);
4219 if (ch != *s) {
4220 status = 1;
4221 *s = ch;
4222 }
4223 s++;
4224 }
4225
4226 return status;
4227}
4228
Tim Petersced69f82003-09-16 20:30:58 +00004229static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004230int fixswapcase(PyUnicodeObject *self)
4231{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004232 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004233 Py_UNICODE *s = self->str;
4234 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004235
Guido van Rossumd57fd912000-03-10 22:53:23 +00004236 while (len-- > 0) {
4237 if (Py_UNICODE_ISUPPER(*s)) {
4238 *s = Py_UNICODE_TOLOWER(*s);
4239 status = 1;
4240 } else if (Py_UNICODE_ISLOWER(*s)) {
4241 *s = Py_UNICODE_TOUPPER(*s);
4242 status = 1;
4243 }
4244 s++;
4245 }
4246
4247 return status;
4248}
4249
Tim Petersced69f82003-09-16 20:30:58 +00004250static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004251int fixcapitalize(PyUnicodeObject *self)
4252{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004253 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004254 Py_UNICODE *s = self->str;
4255 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004256
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004257 if (len == 0)
4258 return 0;
4259 if (Py_UNICODE_ISLOWER(*s)) {
4260 *s = Py_UNICODE_TOUPPER(*s);
4261 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004262 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004263 s++;
4264 while (--len > 0) {
4265 if (Py_UNICODE_ISUPPER(*s)) {
4266 *s = Py_UNICODE_TOLOWER(*s);
4267 status = 1;
4268 }
4269 s++;
4270 }
4271 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004272}
4273
4274static
4275int fixtitle(PyUnicodeObject *self)
4276{
4277 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4278 register Py_UNICODE *e;
4279 int previous_is_cased;
4280
4281 /* Shortcut for single character strings */
4282 if (PyUnicode_GET_SIZE(self) == 1) {
4283 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4284 if (*p != ch) {
4285 *p = ch;
4286 return 1;
4287 }
4288 else
4289 return 0;
4290 }
Tim Petersced69f82003-09-16 20:30:58 +00004291
Guido van Rossumd57fd912000-03-10 22:53:23 +00004292 e = p + PyUnicode_GET_SIZE(self);
4293 previous_is_cased = 0;
4294 for (; p < e; p++) {
4295 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004296
Guido van Rossumd57fd912000-03-10 22:53:23 +00004297 if (previous_is_cased)
4298 *p = Py_UNICODE_TOLOWER(ch);
4299 else
4300 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004301
4302 if (Py_UNICODE_ISLOWER(ch) ||
4303 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004304 Py_UNICODE_ISTITLE(ch))
4305 previous_is_cased = 1;
4306 else
4307 previous_is_cased = 0;
4308 }
4309 return 1;
4310}
4311
Tim Peters8ce9f162004-08-27 01:49:32 +00004312PyObject *
4313PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004314{
Tim Peters8ce9f162004-08-27 01:49:32 +00004315 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004316 const Py_UNICODE blank = ' ';
4317 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00004318 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004319 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00004320 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4321 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004322 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4323 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004324 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004325 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004326 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004327
Tim Peters05eba1f2004-08-27 21:32:02 +00004328 fseq = PySequence_Fast(seq, "");
4329 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004330 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004331 }
4332
Tim Peters91879ab2004-08-27 22:35:44 +00004333 /* Grrrr. A codec may be invoked to convert str objects to
4334 * Unicode, and so it's possible to call back into Python code
4335 * during PyUnicode_FromObject(), and so it's possible for a sick
4336 * codec to change the size of fseq (if seq is a list). Therefore
4337 * we have to keep refetching the size -- can't assume seqlen
4338 * is invariant.
4339 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004340 seqlen = PySequence_Fast_GET_SIZE(fseq);
4341 /* If empty sequence, return u"". */
4342 if (seqlen == 0) {
4343 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4344 goto Done;
4345 }
4346 /* If singleton sequence with an exact Unicode, return that. */
4347 if (seqlen == 1) {
4348 item = PySequence_Fast_GET_ITEM(fseq, 0);
4349 if (PyUnicode_CheckExact(item)) {
4350 Py_INCREF(item);
4351 res = (PyUnicodeObject *)item;
4352 goto Done;
4353 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004354 }
4355
Tim Peters05eba1f2004-08-27 21:32:02 +00004356 /* At least two items to join, or one that isn't exact Unicode. */
4357 if (seqlen > 1) {
4358 /* Set up sep and seplen -- they're needed. */
4359 if (separator == NULL) {
4360 sep = &blank;
4361 seplen = 1;
4362 }
4363 else {
4364 internal_separator = PyUnicode_FromObject(separator);
4365 if (internal_separator == NULL)
4366 goto onError;
4367 sep = PyUnicode_AS_UNICODE(internal_separator);
4368 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004369 /* In case PyUnicode_FromObject() mutated seq. */
4370 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004371 }
4372 }
4373
4374 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004375 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004376 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004377 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004378 res_p = PyUnicode_AS_UNICODE(res);
4379 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004380
Tim Peters05eba1f2004-08-27 21:32:02 +00004381 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00004382 Py_ssize_t itemlen;
4383 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004384
4385 item = PySequence_Fast_GET_ITEM(fseq, i);
4386 /* Convert item to Unicode. */
4387 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4388 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00004389 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004390 " %.80s found",
4391 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004392 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004393 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004394 item = PyUnicode_FromObject(item);
4395 if (item == NULL)
4396 goto onError;
4397 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004398
Tim Peters91879ab2004-08-27 22:35:44 +00004399 /* In case PyUnicode_FromObject() mutated seq. */
4400 seqlen = PySequence_Fast_GET_SIZE(fseq);
4401
Tim Peters8ce9f162004-08-27 01:49:32 +00004402 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004403 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004404 new_res_used = res_used + itemlen;
Tim Peters286085c2006-05-22 19:17:04 +00004405 if (new_res_used <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004406 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004407 if (i < seqlen - 1) {
4408 new_res_used += seplen;
Tim Peters286085c2006-05-22 19:17:04 +00004409 if (new_res_used <= 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004410 goto Overflow;
4411 }
4412 if (new_res_used > res_alloc) {
4413 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004414 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004415 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00004416 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004417 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004418 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00004419 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004420 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004421 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004422 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004423 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004424 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004425
4426 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004427 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004428 res_p += itemlen;
4429 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004430 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004431 res_p += seplen;
4432 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004433 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004434 res_used = new_res_used;
4435 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004436
Tim Peters05eba1f2004-08-27 21:32:02 +00004437 /* Shrink res to match the used area; this probably can't fail,
4438 * but it's cheap to check.
4439 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004440 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004441 goto onError;
4442
4443 Done:
4444 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004445 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004446 return (PyObject *)res;
4447
Tim Peters8ce9f162004-08-27 01:49:32 +00004448 Overflow:
4449 PyErr_SetString(PyExc_OverflowError,
4450 "join() is too long for a Python string");
4451 Py_DECREF(item);
4452 /* fall through */
4453
Guido van Rossumd57fd912000-03-10 22:53:23 +00004454 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004455 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004456 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004457 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004458 return NULL;
4459}
4460
Tim Petersced69f82003-09-16 20:30:58 +00004461static
4462PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004463 Py_ssize_t left,
4464 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004465 Py_UNICODE fill)
4466{
4467 PyUnicodeObject *u;
4468
4469 if (left < 0)
4470 left = 0;
4471 if (right < 0)
4472 right = 0;
4473
Tim Peters7a29bd52001-09-12 03:03:31 +00004474 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004475 Py_INCREF(self);
4476 return self;
4477 }
4478
4479 u = _PyUnicode_New(left + self->length + right);
4480 if (u) {
4481 if (left)
4482 Py_UNICODE_FILL(u->str, fill, left);
4483 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4484 if (right)
4485 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4486 }
4487
4488 return u;
4489}
4490
/* Append data[left:right] to the list being built, as a new Unicode
   object.  The expansion relies on the enclosing function providing a
   `PyObject *str` scratch variable, a `list` object and an `onError`
   label, which is jumped to if creating or appending the piece fails. */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (str == NULL)                                                    \
        goto onError;                                                   \
    if (PyList_Append(list, str) != 0) {                                \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    else                                                                \
        Py_DECREF(str);
4501
4502static
4503PyObject *split_whitespace(PyUnicodeObject *self,
4504 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004505 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004506{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004507 register Py_ssize_t i;
4508 register Py_ssize_t j;
4509 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004510 PyObject *str;
4511
4512 for (i = j = 0; i < len; ) {
4513 /* find a token */
4514 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4515 i++;
4516 j = i;
4517 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4518 i++;
4519 if (j < i) {
4520 if (maxcount-- <= 0)
4521 break;
4522 SPLIT_APPEND(self->str, j, i);
4523 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4524 i++;
4525 j = i;
4526 }
4527 }
4528 if (j < len) {
4529 SPLIT_APPEND(self->str, j, len);
4530 }
4531 return list;
4532
4533 onError:
4534 Py_DECREF(list);
4535 return NULL;
4536}
4537
4538PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004539 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004540{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004541 register Py_ssize_t i;
4542 register Py_ssize_t j;
4543 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004544 PyObject *list;
4545 PyObject *str;
4546 Py_UNICODE *data;
4547
4548 string = PyUnicode_FromObject(string);
4549 if (string == NULL)
4550 return NULL;
4551 data = PyUnicode_AS_UNICODE(string);
4552 len = PyUnicode_GET_SIZE(string);
4553
Guido van Rossumd57fd912000-03-10 22:53:23 +00004554 list = PyList_New(0);
4555 if (!list)
4556 goto onError;
4557
4558 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004559 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004560
Guido van Rossumd57fd912000-03-10 22:53:23 +00004561 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004562 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004563 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004564
4565 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004566 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004567 if (i < len) {
4568 if (data[i] == '\r' && i + 1 < len &&
4569 data[i+1] == '\n')
4570 i += 2;
4571 else
4572 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004573 if (keepends)
4574 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004575 }
Guido van Rossum86662912000-04-11 15:38:46 +00004576 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004577 j = i;
4578 }
4579 if (j < len) {
4580 SPLIT_APPEND(data, j, len);
4581 }
4582
4583 Py_DECREF(string);
4584 return list;
4585
4586 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004587 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004588 Py_DECREF(string);
4589 return NULL;
4590}
4591
Tim Petersced69f82003-09-16 20:30:58 +00004592static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004593PyObject *split_char(PyUnicodeObject *self,
4594 PyObject *list,
4595 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004596 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004597{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004598 register Py_ssize_t i;
4599 register Py_ssize_t j;
4600 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004601 PyObject *str;
4602
4603 for (i = j = 0; i < len; ) {
4604 if (self->str[i] == ch) {
4605 if (maxcount-- <= 0)
4606 break;
4607 SPLIT_APPEND(self->str, j, i);
4608 i = j = i + 1;
4609 } else
4610 i++;
4611 }
4612 if (j <= len) {
4613 SPLIT_APPEND(self->str, j, len);
4614 }
4615 return list;
4616
4617 onError:
4618 Py_DECREF(list);
4619 return NULL;
4620}
4621
Tim Petersced69f82003-09-16 20:30:58 +00004622static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004623PyObject *split_substring(PyUnicodeObject *self,
4624 PyObject *list,
4625 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004626 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004627{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004628 register Py_ssize_t i;
4629 register Py_ssize_t j;
4630 Py_ssize_t len = self->length;
4631 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004632 PyObject *str;
4633
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004634 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004635 if (Py_UNICODE_MATCH(self, i, substring)) {
4636 if (maxcount-- <= 0)
4637 break;
4638 SPLIT_APPEND(self->str, j, i);
4639 i = j = i + sublen;
4640 } else
4641 i++;
4642 }
4643 if (j <= len) {
4644 SPLIT_APPEND(self->str, j, len);
4645 }
4646 return list;
4647
4648 onError:
4649 Py_DECREF(list);
4650 return NULL;
4651}
4652
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004653static
4654PyObject *rsplit_whitespace(PyUnicodeObject *self,
4655 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004656 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004657{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004658 register Py_ssize_t i;
4659 register Py_ssize_t j;
4660 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004661 PyObject *str;
4662
4663 for (i = j = len - 1; i >= 0; ) {
4664 /* find a token */
4665 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4666 i--;
4667 j = i;
4668 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4669 i--;
4670 if (j > i) {
4671 if (maxcount-- <= 0)
4672 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004673 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004674 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4675 i--;
4676 j = i;
4677 }
4678 }
4679 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004680 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004681 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004682 if (PyList_Reverse(list) < 0)
4683 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004684 return list;
4685
4686 onError:
4687 Py_DECREF(list);
4688 return NULL;
4689}
4690
4691static
4692PyObject *rsplit_char(PyUnicodeObject *self,
4693 PyObject *list,
4694 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004695 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004696{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004697 register Py_ssize_t i;
4698 register Py_ssize_t j;
4699 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004700 PyObject *str;
4701
4702 for (i = j = len - 1; i >= 0; ) {
4703 if (self->str[i] == ch) {
4704 if (maxcount-- <= 0)
4705 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004706 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004707 j = i = i - 1;
4708 } else
4709 i--;
4710 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004711 if (j >= -1) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004712 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004713 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004714 if (PyList_Reverse(list) < 0)
4715 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004716 return list;
4717
4718 onError:
4719 Py_DECREF(list);
4720 return NULL;
4721}
4722
4723static
4724PyObject *rsplit_substring(PyUnicodeObject *self,
4725 PyObject *list,
4726 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004727 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004728{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004729 register Py_ssize_t i;
4730 register Py_ssize_t j;
4731 Py_ssize_t len = self->length;
4732 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004733 PyObject *str;
4734
4735 for (i = len - sublen, j = len; i >= 0; ) {
4736 if (Py_UNICODE_MATCH(self, i, substring)) {
4737 if (maxcount-- <= 0)
4738 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004739 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004740 j = i;
4741 i -= sublen;
4742 } else
4743 i--;
4744 }
4745 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004746 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004747 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004748 if (PyList_Reverse(list) < 0)
4749 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004750 return list;
4751
4752 onError:
4753 Py_DECREF(list);
4754 return NULL;
4755}
4756
Guido van Rossumd57fd912000-03-10 22:53:23 +00004757#undef SPLIT_APPEND
4758
4759static
4760PyObject *split(PyUnicodeObject *self,
4761 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004762 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763{
4764 PyObject *list;
4765
4766 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004767 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004768
4769 list = PyList_New(0);
4770 if (!list)
4771 return NULL;
4772
4773 if (substring == NULL)
4774 return split_whitespace(self,list,maxcount);
4775
4776 else if (substring->length == 1)
4777 return split_char(self,list,substring->str[0],maxcount);
4778
4779 else if (substring->length == 0) {
4780 Py_DECREF(list);
4781 PyErr_SetString(PyExc_ValueError, "empty separator");
4782 return NULL;
4783 }
4784 else
4785 return split_substring(self,list,substring,maxcount);
4786}
4787
Tim Petersced69f82003-09-16 20:30:58 +00004788static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004789PyObject *rsplit(PyUnicodeObject *self,
4790 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004791 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004792{
4793 PyObject *list;
4794
4795 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004796 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004797
4798 list = PyList_New(0);
4799 if (!list)
4800 return NULL;
4801
4802 if (substring == NULL)
4803 return rsplit_whitespace(self,list,maxcount);
4804
4805 else if (substring->length == 1)
4806 return rsplit_char(self,list,substring->str[0],maxcount);
4807
4808 else if (substring->length == 0) {
4809 Py_DECREF(list);
4810 PyErr_SetString(PyExc_ValueError, "empty separator");
4811 return NULL;
4812 }
4813 else
4814 return rsplit_substring(self,list,substring,maxcount);
4815}
4816
4817static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004818PyObject *replace(PyUnicodeObject *self,
4819 PyUnicodeObject *str1,
4820 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004821 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822{
4823 PyUnicodeObject *u;
4824
4825 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004826 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004827
4828 if (str1->length == 1 && str2->length == 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004829 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004830
4831 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004832 if (!findchar(self->str, self->length, str1->str[0]) &&
4833 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004834 /* nothing to replace, return original string */
4835 Py_INCREF(self);
4836 u = self;
4837 } else {
4838 Py_UNICODE u1 = str1->str[0];
4839 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004840
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004842 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843 self->length
4844 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004845 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004846 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004847 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848 for (i = 0; i < u->length; i++)
4849 if (u->str[i] == u1) {
4850 if (--maxcount < 0)
4851 break;
4852 u->str[i] = u2;
4853 }
4854 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004855 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856
4857 } else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004858 Py_ssize_t n, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859 Py_UNICODE *p;
4860
4861 /* replace strings */
4862 n = count(self, 0, self->length, str1);
4863 if (n > maxcount)
4864 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004865 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004867 if (PyUnicode_CheckExact(self)) {
4868 Py_INCREF(self);
4869 u = self;
4870 }
4871 else {
4872 u = (PyUnicodeObject *)
4873 PyUnicode_FromUnicode(self->str, self->length);
4874 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004875 } else {
4876 u = _PyUnicode_New(
4877 self->length + n * (str2->length - str1->length));
4878 if (u) {
4879 i = 0;
4880 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004881 if (str1->length > 0) {
4882 while (i <= self->length - str1->length)
4883 if (Py_UNICODE_MATCH(self, i, str1)) {
4884 /* replace string segment */
4885 Py_UNICODE_COPY(p, str2->str, str2->length);
4886 p += str2->length;
4887 i += str1->length;
4888 if (--n <= 0) {
4889 /* copy remaining part */
4890 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4891 break;
4892 }
4893 } else
4894 *p++ = self->str[i++];
4895 } else {
4896 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004897 Py_UNICODE_COPY(p, str2->str, str2->length);
4898 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004899 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004900 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004902 }
4903 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4904 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004905 }
4906 }
4907 }
Tim Petersced69f82003-09-16 20:30:58 +00004908
Guido van Rossumd57fd912000-03-10 22:53:23 +00004909 return (PyObject *) u;
4910}
4911
4912/* --- Unicode Object Methods --------------------------------------------- */
4913
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004914PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004915"S.title() -> unicode\n\
4916\n\
4917Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004918characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004919
4920static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004921unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004922{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004923 return fixup(self, fixtitle);
4924}
4925
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004926PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004927"S.capitalize() -> unicode\n\
4928\n\
4929Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004930have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004931
4932static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004933unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004934{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004935 return fixup(self, fixcapitalize);
4936}
4937
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* S.capwords() (compiled out): split on whitespace, capitalize every
   word via fixup(), and join the words again with PyUnicode_Join().
   Returns a new reference, or NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;    /* stays NULL on the error path */
    Py_ssize_t k;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *plain = PyList_GET_ITEM(words, k);
        PyObject *capped = fixup((PyUnicodeObject *)plain, fixcapitalize);

        if (capped == NULL)
            goto done;
        Py_DECREF(plain);
        PyList_SET_ITEM(words, k, capped);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return joined;
}
#endif
4975
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004976/* Argument converter. Coerces to a single unicode character */
4977
4978static int
4979convert_uc(PyObject *obj, void *addr)
4980{
4981 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4982 PyObject *uniobj;
4983 Py_UNICODE *unistr;
4984
4985 uniobj = PyUnicode_FromObject(obj);
4986 if (uniobj == NULL) {
4987 PyErr_SetString(PyExc_TypeError,
4988 "The fill character cannot be converted to Unicode");
4989 return 0;
4990 }
4991 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4992 PyErr_SetString(PyExc_TypeError,
4993 "The fill character must be exactly one character long");
4994 Py_DECREF(uniobj);
4995 return 0;
4996 }
4997 unistr = PyUnicode_AS_UNICODE(uniobj);
4998 *fillcharloc = unistr[0];
4999 Py_DECREF(uniobj);
5000 return 1;
5001}
5002
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005003PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005004"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005005\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005006Return S centered in a Unicode string of length width. Padding is\n\
5007done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005008
5009static PyObject *
5010unicode_center(PyUnicodeObject *self, PyObject *args)
5011{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005012 Py_ssize_t marg, left;
5013 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005014 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005015
Thomas Woutersde017742006-02-16 19:34:37 +00005016 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005017 return NULL;
5018
Tim Peters7a29bd52001-09-12 03:03:31 +00005019 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005020 Py_INCREF(self);
5021 return (PyObject*) self;
5022 }
5023
5024 marg = width - self->length;
5025 left = marg / 2 + (marg & width & 1);
5026
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005027 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005028}
5029
Marc-André Lemburge5034372000-08-08 08:04:29 +00005030#if 0
5031
5032/* This code should go into some future Unicode collation support
5033 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005034 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005035
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005036/* speedy UTF-16 code point order comparison */
5037/* gleaned from: */
5038/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5039
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005040static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005041{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005042 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005043 0, 0, 0, 0, 0, 0, 0, 0,
5044 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005045 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005046};
5047
Guido van Rossumd57fd912000-03-10 22:53:23 +00005048static int
5049unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5050{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005051 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005052
Guido van Rossumd57fd912000-03-10 22:53:23 +00005053 Py_UNICODE *s1 = str1->str;
5054 Py_UNICODE *s2 = str2->str;
5055
5056 len1 = str1->length;
5057 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005058
Guido van Rossumd57fd912000-03-10 22:53:23 +00005059 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005060 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005061
5062 c1 = *s1++;
5063 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005064
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005065 if (c1 > (1<<11) * 26)
5066 c1 += utf16Fixup[c1>>11];
5067 if (c2 > (1<<11) * 26)
5068 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005069 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005070
5071 if (c1 != c2)
5072 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005073
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005074 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005075 }
5076
5077 return (len1 < len2) ? -1 : (len1 != len2);
5078}
5079
Marc-André Lemburge5034372000-08-08 08:04:29 +00005080#else
5081
5082static int
5083unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5084{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005085 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005086
5087 Py_UNICODE *s1 = str1->str;
5088 Py_UNICODE *s2 = str2->str;
5089
5090 len1 = str1->length;
5091 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005092
Marc-André Lemburge5034372000-08-08 08:04:29 +00005093 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005094 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005095
Fredrik Lundh45714e92001-06-26 16:39:36 +00005096 c1 = *s1++;
5097 c2 = *s2++;
5098
5099 if (c1 != c2)
5100 return (c1 < c2) ? -1 : 1;
5101
Marc-André Lemburge5034372000-08-08 08:04:29 +00005102 len1--; len2--;
5103 }
5104
5105 return (len1 < len2) ? -1 : (len1 != len2);
5106}
5107
5108#endif
5109
Guido van Rossumd57fd912000-03-10 22:53:23 +00005110int PyUnicode_Compare(PyObject *left,
5111 PyObject *right)
5112{
5113 PyUnicodeObject *u = NULL, *v = NULL;
5114 int result;
5115
5116 /* Coerce the two arguments */
5117 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5118 if (u == NULL)
5119 goto onError;
5120 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5121 if (v == NULL)
5122 goto onError;
5123
Thomas Wouters7e474022000-07-16 12:04:32 +00005124 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005125 if (v == u) {
5126 Py_DECREF(u);
5127 Py_DECREF(v);
5128 return 0;
5129 }
5130
5131 result = unicode_compare(u, v);
5132
5133 Py_DECREF(u);
5134 Py_DECREF(v);
5135 return result;
5136
5137onError:
5138 Py_XDECREF(u);
5139 Py_XDECREF(v);
5140 return -1;
5141}
5142
Guido van Rossum403d68b2000-03-13 15:55:09 +00005143int PyUnicode_Contains(PyObject *container,
5144 PyObject *element)
5145{
Fredrik Lundh833bf942006-05-23 10:12:21 +00005146 PyUnicodeObject *u, *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005147 int result;
5148 Py_ssize_t size;
Fredrik Lundhd5e0dc52006-05-24 15:11:01 +00005149#ifdef USE_FAST
5150 Py_ssize_t pos;
5151#endif
Guido van Rossum403d68b2000-03-13 15:55:09 +00005152
5153 /* Coerce the two arguments */
Fredrik Lundh833bf942006-05-23 10:12:21 +00005154 v = (PyUnicodeObject *) PyUnicode_FromObject(element);
5155 if (!v) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005156 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005157 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00005158 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005159 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00005160
5161 u = (PyUnicodeObject *) PyUnicode_FromObject(container);
5162 if (!u) {
5163 Py_DECREF(v);
5164 return -1;
5165 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005166
Barry Warsaw817918c2002-08-06 16:58:21 +00005167 size = PyUnicode_GET_SIZE(v);
Fredrik Lundh833bf942006-05-23 10:12:21 +00005168 if (!size) {
5169 result = 1;
5170 goto done;
5171 }
Barry Warsaw817918c2002-08-06 16:58:21 +00005172
Fredrik Lundhd5e0dc52006-05-24 15:11:01 +00005173#ifdef USE_FAST
5174 pos = fastsearch(
5175 PyUnicode_AS_UNICODE(u), PyUnicode_GET_SIZE(u),
5176 PyUnicode_AS_UNICODE(v), size, FAST_SEARCH
5177 );
5178 result = (pos != -1);
5179#else
Guido van Rossum403d68b2000-03-13 15:55:09 +00005180 result = 0;
Fredrik Lundh833bf942006-05-23 10:12:21 +00005181
Barry Warsaw817918c2002-08-06 16:58:21 +00005182 if (size == 1) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00005183 Py_UNICODE chr = PyUnicode_AS_UNICODE(v)[0];
5184 Py_UNICODE* ptr = PyUnicode_AS_UNICODE(u);
5185 Py_UNICODE* end = ptr + PyUnicode_GET_SIZE(u);
5186 for (; ptr < end; ptr++) {
5187 if (*ptr == chr) {
Barry Warsaw817918c2002-08-06 16:58:21 +00005188 result = 1;
5189 break;
5190 }
5191 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00005192 } else {
Fredrik Lundh240bf2a2006-05-24 10:20:36 +00005193 Py_ssize_t start = 0;
5194 Py_ssize_t end = PyUnicode_GET_SIZE(u) - size;
Fredrik Lundh833bf942006-05-23 10:12:21 +00005195 for (; start <= end; start++)
5196 if (Py_UNICODE_MATCH(u, start, v)) {
5197 result = 1;
5198 break;
5199 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005200 }
Fredrik Lundhd5e0dc52006-05-24 15:11:01 +00005201#endif
Guido van Rossum403d68b2000-03-13 15:55:09 +00005202
Fredrik Lundh833bf942006-05-23 10:12:21 +00005203done:
Guido van Rossum403d68b2000-03-13 15:55:09 +00005204 Py_DECREF(u);
5205 Py_DECREF(v);
5206 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005207}
5208
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209/* Concat to string or Unicode object giving a new Unicode object. */
5210
5211PyObject *PyUnicode_Concat(PyObject *left,
5212 PyObject *right)
5213{
5214 PyUnicodeObject *u = NULL, *v = NULL, *w;
5215
5216 /* Coerce the two arguments */
5217 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5218 if (u == NULL)
5219 goto onError;
5220 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5221 if (v == NULL)
5222 goto onError;
5223
5224 /* Shortcuts */
5225 if (v == unicode_empty) {
5226 Py_DECREF(v);
5227 return (PyObject *)u;
5228 }
5229 if (u == unicode_empty) {
5230 Py_DECREF(u);
5231 return (PyObject *)v;
5232 }
5233
5234 /* Concat the two Unicode strings */
5235 w = _PyUnicode_New(u->length + v->length);
5236 if (w == NULL)
5237 goto onError;
5238 Py_UNICODE_COPY(w->str, u->str, u->length);
5239 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5240
5241 Py_DECREF(u);
5242 Py_DECREF(v);
5243 return (PyObject *)w;
5244
5245onError:
5246 Py_XDECREF(u);
5247 Py_XDECREF(v);
5248 return NULL;
5249}
5250
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005251PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005252"S.count(sub[, start[, end]]) -> int\n\
5253\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00005254Return the number of non-overlapping occurrences of substring sub in\n\
5255Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005256interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257
5258static PyObject *
5259unicode_count(PyUnicodeObject *self, PyObject *args)
5260{
5261 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005262 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005263 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005264 PyObject *result;
5265
Guido van Rossumb8872e62000-05-09 14:14:27 +00005266 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5267 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268 return NULL;
5269
5270 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5271 (PyObject *)substring);
5272 if (substring == NULL)
5273 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005274
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275 if (start < 0)
5276 start += self->length;
5277 if (start < 0)
5278 start = 0;
5279 if (end > self->length)
5280 end = self->length;
5281 if (end < 0)
5282 end += self->length;
5283 if (end < 0)
5284 end = 0;
5285
5286 result = PyInt_FromLong((long) count(self, start, end, substring));
5287
5288 Py_DECREF(substring);
5289 return result;
5290}
5291
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005292PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005293"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005294\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005295Encodes S using the codec registered for encoding. encoding defaults\n\
5296to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005297handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005298a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5299'xmlcharrefreplace' as well as any other name registered with\n\
5300codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301
5302static PyObject *
5303unicode_encode(PyUnicodeObject *self, PyObject *args)
5304{
5305 char *encoding = NULL;
5306 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005307 PyObject *v;
5308
Guido van Rossumd57fd912000-03-10 22:53:23 +00005309 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5310 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005311 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005312 if (v == NULL)
5313 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005314 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5315 PyErr_Format(PyExc_TypeError,
5316 "encoder did not return a string/unicode object "
5317 "(type=%.400s)",
5318 v->ob_type->tp_name);
5319 Py_DECREF(v);
5320 return NULL;
5321 }
5322 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005323
5324 onError:
5325 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005326}
5327
5328PyDoc_STRVAR(decode__doc__,
5329"S.decode([encoding[,errors]]) -> string or unicode\n\
5330\n\
5331Decodes S using the codec registered for encoding. encoding defaults\n\
5332to the default encoding. errors may be given to set a different error\n\
5333handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5334a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5335as well as any other name registerd with codecs.register_error that is\n\
5336able to handle UnicodeDecodeErrors.");
5337
5338static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005339unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005340{
5341 char *encoding = NULL;
5342 char *errors = NULL;
5343 PyObject *v;
5344
5345 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5346 return NULL;
5347 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005348 if (v == NULL)
5349 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005350 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5351 PyErr_Format(PyExc_TypeError,
5352 "decoder did not return a string/unicode object "
5353 "(type=%.400s)",
5354 v->ob_type->tp_name);
5355 Py_DECREF(v);
5356 return NULL;
5357 }
5358 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005359
5360 onError:
5361 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362}
5363
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005364PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365"S.expandtabs([tabsize]) -> unicode\n\
5366\n\
5367Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005368If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369
5370static PyObject*
5371unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5372{
5373 Py_UNICODE *e;
5374 Py_UNICODE *p;
5375 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005376 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377 PyUnicodeObject *u;
5378 int tabsize = 8;
5379
5380 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5381 return NULL;
5382
Thomas Wouters7e474022000-07-16 12:04:32 +00005383 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005384 i = j = 0;
5385 e = self->str + self->length;
5386 for (p = self->str; p < e; p++)
5387 if (*p == '\t') {
5388 if (tabsize > 0)
5389 j += tabsize - (j % tabsize);
5390 }
5391 else {
5392 j++;
5393 if (*p == '\n' || *p == '\r') {
5394 i += j;
5395 j = 0;
5396 }
5397 }
5398
5399 /* Second pass: create output string and fill it */
5400 u = _PyUnicode_New(i + j);
5401 if (!u)
5402 return NULL;
5403
5404 j = 0;
5405 q = u->str;
5406
5407 for (p = self->str; p < e; p++)
5408 if (*p == '\t') {
5409 if (tabsize > 0) {
5410 i = tabsize - (j % tabsize);
5411 j += i;
5412 while (i--)
5413 *q++ = ' ';
5414 }
5415 }
5416 else {
5417 j++;
5418 *q++ = *p;
5419 if (*p == '\n' || *p == '\r')
5420 j = 0;
5421 }
5422
5423 return (PyObject*) u;
5424}
5425
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005426PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427"S.find(sub [,start [,end]]) -> int\n\
5428\n\
5429Return the lowest index in S where substring sub is found,\n\
5430such that sub is contained within s[start,end]. Optional\n\
5431arguments start and end are interpreted as in slice notation.\n\
5432\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005433Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005434
5435static PyObject *
5436unicode_find(PyUnicodeObject *self, PyObject *args)
5437{
5438 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005439 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005440 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441 PyObject *result;
5442
Guido van Rossumb8872e62000-05-09 14:14:27 +00005443 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5444 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445 return NULL;
5446 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5447 (PyObject *)substring);
5448 if (substring == NULL)
5449 return NULL;
5450
Martin v. Löwis18e16552006-02-15 17:27:45 +00005451 result = PyInt_FromSsize_t(findstring(self, substring, start, end, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452
5453 Py_DECREF(substring);
5454 return result;
5455}
5456
5457static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005458unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459{
5460 if (index < 0 || index >= self->length) {
5461 PyErr_SetString(PyExc_IndexError, "string index out of range");
5462 return NULL;
5463 }
5464
5465 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5466}
5467
5468static long
5469unicode_hash(PyUnicodeObject *self)
5470{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005471 /* Since Unicode objects compare equal to their ASCII string
5472 counterparts, they should use the individual character values
5473 as basis for their hash value. This is needed to assure that
5474 strings and Unicode objects behave in the same way as
5475 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476
Martin v. Löwis18e16552006-02-15 17:27:45 +00005477 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005478 register Py_UNICODE *p;
5479 register long x;
5480
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481 if (self->hash != -1)
5482 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005483 len = PyUnicode_GET_SIZE(self);
5484 p = PyUnicode_AS_UNICODE(self);
5485 x = *p << 7;
5486 while (--len >= 0)
5487 x = (1000003*x) ^ *p++;
5488 x ^= PyUnicode_GET_SIZE(self);
5489 if (x == -1)
5490 x = -2;
5491 self->hash = x;
5492 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493}
5494
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005495PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496"S.index(sub [,start [,end]]) -> int\n\
5497\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005498Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499
5500static PyObject *
5501unicode_index(PyUnicodeObject *self, PyObject *args)
5502{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005503 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005504 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005505 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005506 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507
Guido van Rossumb8872e62000-05-09 14:14:27 +00005508 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5509 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005511
Guido van Rossumd57fd912000-03-10 22:53:23 +00005512 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5513 (PyObject *)substring);
5514 if (substring == NULL)
5515 return NULL;
5516
5517 result = findstring(self, substring, start, end, 1);
5518
5519 Py_DECREF(substring);
5520 if (result < 0) {
5521 PyErr_SetString(PyExc_ValueError, "substring not found");
5522 return NULL;
5523 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005524 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525}
5526
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005527PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005528"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005530Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005531at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005532
5533static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005534unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005535{
5536 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5537 register const Py_UNICODE *e;
5538 int cased;
5539
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540 /* Shortcut for single character strings */
5541 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005542 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005543
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005544 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005545 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005546 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005547
Guido van Rossumd57fd912000-03-10 22:53:23 +00005548 e = p + PyUnicode_GET_SIZE(self);
5549 cased = 0;
5550 for (; p < e; p++) {
5551 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005552
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005554 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555 else if (!cased && Py_UNICODE_ISLOWER(ch))
5556 cased = 1;
5557 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005558 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559}
5560
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005561PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005562"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005564Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005565at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566
5567static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005568unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569{
5570 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5571 register const Py_UNICODE *e;
5572 int cased;
5573
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 /* Shortcut for single character strings */
5575 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005576 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005578 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005579 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005580 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005581
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582 e = p + PyUnicode_GET_SIZE(self);
5583 cased = 0;
5584 for (; p < e; p++) {
5585 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005586
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005588 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589 else if (!cased && Py_UNICODE_ISUPPER(ch))
5590 cased = 1;
5591 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005592 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593}
5594
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005595PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005596"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005598Return True if S is a titlecased string and there is at least one\n\
5599character in S, i.e. upper- and titlecase characters may only\n\
5600follow uncased characters and lowercase characters only cased ones.\n\
5601Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005602
5603static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005604unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605{
5606 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5607 register const Py_UNICODE *e;
5608 int cased, previous_is_cased;
5609
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610 /* Shortcut for single character strings */
5611 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005612 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5613 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005615 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005616 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005617 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005618
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619 e = p + PyUnicode_GET_SIZE(self);
5620 cased = 0;
5621 previous_is_cased = 0;
5622 for (; p < e; p++) {
5623 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005624
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5626 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005627 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628 previous_is_cased = 1;
5629 cased = 1;
5630 }
5631 else if (Py_UNICODE_ISLOWER(ch)) {
5632 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005633 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634 previous_is_cased = 1;
5635 cased = 1;
5636 }
5637 else
5638 previous_is_cased = 0;
5639 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005640 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641}
5642
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005643PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005644"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005646Return True if all characters in S are whitespace\n\
5647and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648
5649static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005650unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651{
5652 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5653 register const Py_UNICODE *e;
5654
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655 /* Shortcut for single character strings */
5656 if (PyUnicode_GET_SIZE(self) == 1 &&
5657 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005658 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005660 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005661 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005662 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005663
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664 e = p + PyUnicode_GET_SIZE(self);
5665 for (; p < e; p++) {
5666 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005667 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005669 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670}
5671
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005672PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005673"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005674\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005675Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005676and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005677
5678static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005679unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005680{
5681 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5682 register const Py_UNICODE *e;
5683
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005684 /* Shortcut for single character strings */
5685 if (PyUnicode_GET_SIZE(self) == 1 &&
5686 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005687 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005688
5689 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005690 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005691 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005692
5693 e = p + PyUnicode_GET_SIZE(self);
5694 for (; p < e; p++) {
5695 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005696 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005697 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005698 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005699}
5700
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005701PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005702"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005703\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005704Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005705and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005706
5707static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005708unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005709{
5710 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5711 register const Py_UNICODE *e;
5712
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005713 /* Shortcut for single character strings */
5714 if (PyUnicode_GET_SIZE(self) == 1 &&
5715 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005716 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005717
5718 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005719 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005720 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005721
5722 e = p + PyUnicode_GET_SIZE(self);
5723 for (; p < e; p++) {
5724 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005725 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005726 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005727 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005728}
5729
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005730PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005731"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005733Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005734False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735
5736static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005737unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738{
5739 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5740 register const Py_UNICODE *e;
5741
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742 /* Shortcut for single character strings */
5743 if (PyUnicode_GET_SIZE(self) == 1 &&
5744 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005745 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005747 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005748 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005749 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005750
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751 e = p + PyUnicode_GET_SIZE(self);
5752 for (; p < e; p++) {
5753 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005754 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005756 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757}
5758
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005759PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005760"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005762Return True if all characters in S are digits\n\
5763and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764
5765static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005766unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767{
5768 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5769 register const Py_UNICODE *e;
5770
Guido van Rossumd57fd912000-03-10 22:53:23 +00005771 /* Shortcut for single character strings */
5772 if (PyUnicode_GET_SIZE(self) == 1 &&
5773 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005774 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005776 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005777 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005778 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005779
Guido van Rossumd57fd912000-03-10 22:53:23 +00005780 e = p + PyUnicode_GET_SIZE(self);
5781 for (; p < e; p++) {
5782 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005783 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005784 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005785 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786}
5787
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005788PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005789"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005791Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005792False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005793
5794static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005795unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796{
5797 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5798 register const Py_UNICODE *e;
5799
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800 /* Shortcut for single character strings */
5801 if (PyUnicode_GET_SIZE(self) == 1 &&
5802 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005803 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005804
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005805 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005806 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005807 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005808
Guido van Rossumd57fd912000-03-10 22:53:23 +00005809 e = p + PyUnicode_GET_SIZE(self);
5810 for (; p < e; p++) {
5811 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005812 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005814 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815}
5816
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005817PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818"S.join(sequence) -> unicode\n\
5819\n\
5820Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005821sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005822
5823static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005824unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005826 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827}
5828
Martin v. Löwis18e16552006-02-15 17:27:45 +00005829static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00005830unicode_length(PyUnicodeObject *self)
5831{
5832 return self->length;
5833}
5834
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005835PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005836"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837\n\
5838Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005839done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840
5841static PyObject *
5842unicode_ljust(PyUnicodeObject *self, PyObject *args)
5843{
Martin v. Löwis412fb672006-04-13 06:34:32 +00005844 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005845 Py_UNICODE fillchar = ' ';
5846
Martin v. Löwis412fb672006-04-13 06:34:32 +00005847 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848 return NULL;
5849
Tim Peters7a29bd52001-09-12 03:03:31 +00005850 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851 Py_INCREF(self);
5852 return (PyObject*) self;
5853 }
5854
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005855 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856}
5857
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005858PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859"S.lower() -> unicode\n\
5860\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005861Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862
5863static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005864unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866 return fixup(self, fixlower);
5867}
5868
/* Strip modes; they double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for a strip mode: the format string with its leading "|O:"
   (three characters) skipped. */
#define STRIPNAME(i) (stripformat[i] + 3)
5877
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005878/* externally visible for str.strip(unicode) */
5879PyObject *
5880_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5881{
5882 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005883 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005884 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005885 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
5886 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005887
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005888 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
5889
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005890 i = 0;
5891 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005892 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
5893 i++;
5894 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005895 }
5896
5897 j = len;
5898 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005899 do {
5900 j--;
5901 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
5902 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005903 }
5904
5905 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005906 Py_INCREF(self);
5907 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005908 }
5909 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005910 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005911}
5912
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913
5914static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005915do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005917 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005918 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005919
5920 i = 0;
5921 if (striptype != RIGHTSTRIP) {
5922 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5923 i++;
5924 }
5925 }
5926
5927 j = len;
5928 if (striptype != LEFTSTRIP) {
5929 do {
5930 j--;
5931 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5932 j++;
5933 }
5934
5935 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5936 Py_INCREF(self);
5937 return (PyObject*)self;
5938 }
5939 else
5940 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005941}
5942
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005943
5944static PyObject *
5945do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5946{
5947 PyObject *sep = NULL;
5948
5949 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5950 return NULL;
5951
5952 if (sep != NULL && sep != Py_None) {
5953 if (PyUnicode_Check(sep))
5954 return _PyUnicode_XStrip(self, striptype, sep);
5955 else if (PyString_Check(sep)) {
5956 PyObject *res;
5957 sep = PyUnicode_FromObject(sep);
5958 if (sep==NULL)
5959 return NULL;
5960 res = _PyUnicode_XStrip(self, striptype, sep);
5961 Py_DECREF(sep);
5962 return res;
5963 }
5964 else {
5965 PyErr_Format(PyExc_TypeError,
5966 "%s arg must be None, unicode or str",
5967 STRIPNAME(striptype));
5968 return NULL;
5969 }
5970 }
5971
5972 return do_strip(self, striptype);
5973}
5974
5975
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005976PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005977"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005978\n\
5979Return a copy of the string S with leading and trailing\n\
5980whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005981If chars is given and not None, remove characters in chars instead.\n\
5982If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005983
5984static PyObject *
5985unicode_strip(PyUnicodeObject *self, PyObject *args)
5986{
5987 if (PyTuple_GET_SIZE(args) == 0)
5988 return do_strip(self, BOTHSTRIP); /* Common case */
5989 else
5990 return do_argstrip(self, BOTHSTRIP, args);
5991}
5992
5993
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005994PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005995"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005996\n\
5997Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005998If chars is given and not None, remove characters in chars instead.\n\
5999If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006000
6001static PyObject *
6002unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6003{
6004 if (PyTuple_GET_SIZE(args) == 0)
6005 return do_strip(self, LEFTSTRIP); /* Common case */
6006 else
6007 return do_argstrip(self, LEFTSTRIP, args);
6008}
6009
6010
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006011PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006012"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006013\n\
6014Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006015If chars is given and not None, remove characters in chars instead.\n\
6016If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006017
6018static PyObject *
6019unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6020{
6021 if (PyTuple_GET_SIZE(args) == 0)
6022 return do_strip(self, RIGHTSTRIP); /* Common case */
6023 else
6024 return do_argstrip(self, RIGHTSTRIP, args);
6025}
6026
6027
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006029unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030{
6031 PyUnicodeObject *u;
6032 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006033 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006034 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035
6036 if (len < 0)
6037 len = 0;
6038
Tim Peters7a29bd52001-09-12 03:03:31 +00006039 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040 /* no repeat, return original string */
6041 Py_INCREF(str);
6042 return (PyObject*) str;
6043 }
Tim Peters8f422462000-09-09 06:13:41 +00006044
6045 /* ensure # of chars needed doesn't overflow int and # of bytes
6046 * needed doesn't overflow size_t
6047 */
6048 nchars = len * str->length;
6049 if (len && nchars / len != str->length) {
6050 PyErr_SetString(PyExc_OverflowError,
6051 "repeated string is too long");
6052 return NULL;
6053 }
6054 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6055 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6056 PyErr_SetString(PyExc_OverflowError,
6057 "repeated string is too long");
6058 return NULL;
6059 }
6060 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061 if (!u)
6062 return NULL;
6063
6064 p = u->str;
6065
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006066 if (str->length == 1 && len > 0) {
6067 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006068 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00006069 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006070 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006071 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006072 done = str->length;
6073 }
6074 while (done < nchars) {
6075 int n = (done <= nchars-done) ? done : nchars-done;
6076 Py_UNICODE_COPY(p+done, p, n);
6077 done += n;
6078 }
6079 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080
6081 return (PyObject*) u;
6082}
6083
6084PyObject *PyUnicode_Replace(PyObject *obj,
6085 PyObject *subobj,
6086 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006087 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088{
6089 PyObject *self;
6090 PyObject *str1;
6091 PyObject *str2;
6092 PyObject *result;
6093
6094 self = PyUnicode_FromObject(obj);
6095 if (self == NULL)
6096 return NULL;
6097 str1 = PyUnicode_FromObject(subobj);
6098 if (str1 == NULL) {
6099 Py_DECREF(self);
6100 return NULL;
6101 }
6102 str2 = PyUnicode_FromObject(replobj);
6103 if (str2 == NULL) {
6104 Py_DECREF(self);
6105 Py_DECREF(str1);
6106 return NULL;
6107 }
Tim Petersced69f82003-09-16 20:30:58 +00006108 result = replace((PyUnicodeObject *)self,
6109 (PyUnicodeObject *)str1,
6110 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111 maxcount);
6112 Py_DECREF(self);
6113 Py_DECREF(str1);
6114 Py_DECREF(str2);
6115 return result;
6116}
6117
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006118PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119"S.replace (old, new[, maxsplit]) -> unicode\n\
6120\n\
6121Return a copy of S with all occurrences of substring\n\
6122old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006123given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124
6125static PyObject*
6126unicode_replace(PyUnicodeObject *self, PyObject *args)
6127{
6128 PyUnicodeObject *str1;
6129 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006130 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131 PyObject *result;
6132
Martin v. Löwis18e16552006-02-15 17:27:45 +00006133 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134 return NULL;
6135 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6136 if (str1 == NULL)
6137 return NULL;
6138 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006139 if (str2 == NULL) {
6140 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006142 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143
6144 result = replace(self, str1, str2, maxcount);
6145
6146 Py_DECREF(str1);
6147 Py_DECREF(str2);
6148 return result;
6149}
6150
6151static
6152PyObject *unicode_repr(PyObject *unicode)
6153{
6154 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6155 PyUnicode_GET_SIZE(unicode),
6156 1);
6157}
6158
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006159PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160"S.rfind(sub [,start [,end]]) -> int\n\
6161\n\
6162Return the highest index in S where substring sub is found,\n\
6163such that sub is contained within s[start,end]. Optional\n\
6164arguments start and end are interpreted as in slice notation.\n\
6165\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006166Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167
6168static PyObject *
6169unicode_rfind(PyUnicodeObject *self, PyObject *args)
6170{
6171 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006172 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006173 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174 PyObject *result;
6175
Guido van Rossumb8872e62000-05-09 14:14:27 +00006176 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6177 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178 return NULL;
6179 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6180 (PyObject *)substring);
6181 if (substring == NULL)
6182 return NULL;
6183
Martin v. Löwis18e16552006-02-15 17:27:45 +00006184 result = PyInt_FromSsize_t(findstring(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185
6186 Py_DECREF(substring);
6187 return result;
6188}
6189
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006190PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191"S.rindex(sub [,start [,end]]) -> int\n\
6192\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006193Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194
6195static PyObject *
6196unicode_rindex(PyUnicodeObject *self, PyObject *args)
6197{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006198 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006200 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006201 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202
Guido van Rossumb8872e62000-05-09 14:14:27 +00006203 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6204 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205 return NULL;
6206 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6207 (PyObject *)substring);
6208 if (substring == NULL)
6209 return NULL;
6210
6211 result = findstring(self, substring, start, end, -1);
6212
6213 Py_DECREF(substring);
6214 if (result < 0) {
6215 PyErr_SetString(PyExc_ValueError, "substring not found");
6216 return NULL;
6217 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006218 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219}
6220
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006221PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006222"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223\n\
6224Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006225done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226
6227static PyObject *
6228unicode_rjust(PyUnicodeObject *self, PyObject *args)
6229{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006230 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006231 Py_UNICODE fillchar = ' ';
6232
Martin v. Löwis412fb672006-04-13 06:34:32 +00006233 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234 return NULL;
6235
Tim Peters7a29bd52001-09-12 03:03:31 +00006236 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237 Py_INCREF(self);
6238 return (PyObject*) self;
6239 }
6240
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006241 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006242}
6243
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006245unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246{
6247 /* standard clamping */
6248 if (start < 0)
6249 start = 0;
6250 if (end < 0)
6251 end = 0;
6252 if (end > self->length)
6253 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006254 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006255 /* full slice, return original string */
6256 Py_INCREF(self);
6257 return (PyObject*) self;
6258 }
6259 if (start > end)
6260 start = end;
6261 /* copy slice */
6262 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6263 end - start);
6264}
6265
6266PyObject *PyUnicode_Split(PyObject *s,
6267 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006268 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269{
6270 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006271
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272 s = PyUnicode_FromObject(s);
6273 if (s == NULL)
6274 return NULL;
6275 if (sep != NULL) {
6276 sep = PyUnicode_FromObject(sep);
6277 if (sep == NULL) {
6278 Py_DECREF(s);
6279 return NULL;
6280 }
6281 }
6282
6283 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6284
6285 Py_DECREF(s);
6286 Py_XDECREF(sep);
6287 return result;
6288}
6289
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006290PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291"S.split([sep [,maxsplit]]) -> list of strings\n\
6292\n\
6293Return a list of the words in S, using sep as the\n\
6294delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006295splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006296any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297
6298static PyObject*
6299unicode_split(PyUnicodeObject *self, PyObject *args)
6300{
6301 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006302 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303
Martin v. Löwis18e16552006-02-15 17:27:45 +00006304 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305 return NULL;
6306
6307 if (substring == Py_None)
6308 return split(self, NULL, maxcount);
6309 else if (PyUnicode_Check(substring))
6310 return split(self, (PyUnicodeObject *)substring, maxcount);
6311 else
6312 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6313}
6314
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006315PyObject *PyUnicode_RSplit(PyObject *s,
6316 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006317 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006318{
6319 PyObject *result;
6320
6321 s = PyUnicode_FromObject(s);
6322 if (s == NULL)
6323 return NULL;
6324 if (sep != NULL) {
6325 sep = PyUnicode_FromObject(sep);
6326 if (sep == NULL) {
6327 Py_DECREF(s);
6328 return NULL;
6329 }
6330 }
6331
6332 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6333
6334 Py_DECREF(s);
6335 Py_XDECREF(sep);
6336 return result;
6337}
6338
6339PyDoc_STRVAR(rsplit__doc__,
6340"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6341\n\
6342Return a list of the words in S, using sep as the\n\
6343delimiter string, starting at the end of the string and\n\
6344working to the front. If maxsplit is given, at most maxsplit\n\
6345splits are done. If sep is not specified, any whitespace string\n\
6346is a separator.");
6347
6348static PyObject*
6349unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6350{
6351 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006352 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006353
Martin v. Löwis18e16552006-02-15 17:27:45 +00006354 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006355 return NULL;
6356
6357 if (substring == Py_None)
6358 return rsplit(self, NULL, maxcount);
6359 else if (PyUnicode_Check(substring))
6360 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6361 else
6362 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6363}
6364
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006365PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006366"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006367\n\
6368Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006369Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006370is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371
6372static PyObject*
6373unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6374{
Guido van Rossum86662912000-04-11 15:38:46 +00006375 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006376
Guido van Rossum86662912000-04-11 15:38:46 +00006377 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378 return NULL;
6379
Guido van Rossum86662912000-04-11 15:38:46 +00006380 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381}
6382
6383static
6384PyObject *unicode_str(PyUnicodeObject *self)
6385{
Fred Drakee4315f52000-05-09 19:53:39 +00006386 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387}
6388
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006389PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390"S.swapcase() -> unicode\n\
6391\n\
6392Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006393and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394
6395static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006396unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398 return fixup(self, fixswapcase);
6399}
6400
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006401PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402"S.translate(table) -> unicode\n\
6403\n\
6404Return a copy of the string S, where all characters have been mapped\n\
6405through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006406Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6407Unmapped characters are left untouched. Characters mapped to None\n\
6408are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409
6410static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006411unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412{
Tim Petersced69f82003-09-16 20:30:58 +00006413 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006415 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416 "ignore");
6417}
6418
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006419PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420"S.upper() -> unicode\n\
6421\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006422Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423
6424static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006425unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427 return fixup(self, fixupper);
6428}
6429
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006430PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431"S.zfill(width) -> unicode\n\
6432\n\
6433Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006434of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435
6436static PyObject *
6437unicode_zfill(PyUnicodeObject *self, PyObject *args)
6438{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006439 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440 PyUnicodeObject *u;
6441
Martin v. Löwis18e16552006-02-15 17:27:45 +00006442 Py_ssize_t width;
6443 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444 return NULL;
6445
6446 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006447 if (PyUnicode_CheckExact(self)) {
6448 Py_INCREF(self);
6449 return (PyObject*) self;
6450 }
6451 else
6452 return PyUnicode_FromUnicode(
6453 PyUnicode_AS_UNICODE(self),
6454 PyUnicode_GET_SIZE(self)
6455 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456 }
6457
6458 fill = width - self->length;
6459
6460 u = pad(self, fill, 0, '0');
6461
Walter Dörwald068325e2002-04-15 13:36:47 +00006462 if (u == NULL)
6463 return NULL;
6464
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465 if (u->str[fill] == '+' || u->str[fill] == '-') {
6466 /* move sign to beginning of string */
6467 u->str[0] = u->str[fill];
6468 u->str[fill] = '0';
6469 }
6470
6471 return (PyObject*) u;
6472}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473
#if 0
/* Debugging aid (compiled out): report the current value of
   unicode_freelist_size as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *size = PyInt_FromLong(unicode_freelist_size);

    return size;
}
#endif
6481
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006482PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006483"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006485Return True if S starts with the specified prefix, False otherwise.\n\
6486With optional start, test S beginning at that position.\n\
6487With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488
6489static PyObject *
6490unicode_startswith(PyUnicodeObject *self,
6491 PyObject *args)
6492{
6493 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006494 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006495 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496 PyObject *result;
6497
Guido van Rossumb8872e62000-05-09 14:14:27 +00006498 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6499 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500 return NULL;
6501 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6502 (PyObject *)substring);
6503 if (substring == NULL)
6504 return NULL;
6505
Guido van Rossum77f6a652002-04-03 22:41:51 +00006506 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507
6508 Py_DECREF(substring);
6509 return result;
6510}
6511
6512
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006513PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006514"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006516Return True if S ends with the specified suffix, False otherwise.\n\
6517With optional start, test S beginning at that position.\n\
6518With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519
6520static PyObject *
6521unicode_endswith(PyUnicodeObject *self,
6522 PyObject *args)
6523{
6524 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006525 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006526 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527 PyObject *result;
6528
Guido van Rossumb8872e62000-05-09 14:14:27 +00006529 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6530 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531 return NULL;
6532 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6533 (PyObject *)substring);
6534 if (substring == NULL)
6535 return NULL;
6536
Guido van Rossum77f6a652002-04-03 22:41:51 +00006537 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538
6539 Py_DECREF(substring);
6540 return result;
6541}
6542
6543
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006544
6545static PyObject *
6546unicode_getnewargs(PyUnicodeObject *v)
6547{
6548 return Py_BuildValue("(u#)", v->str, v->length);
6549}
6550
6551
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552static PyMethodDef unicode_methods[] = {
6553
6554 /* Order is according to common usage: often used methods should
6555 appear first, since lookup is done sequentially. */
6556
Georg Brandlecdc0a92006-03-30 12:19:07 +00006557 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006558 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6559 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006560 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006561 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6562 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6563 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6564 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6565 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6566 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6567 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
6568 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6569 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6570 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006571 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006572 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006573/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6574 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6575 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6576 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006577 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006578 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006579 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006580 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6581 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6582 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6583 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6584 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6585 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6586 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6587 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6588 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6589 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6590 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6591 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6592 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6593 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006594 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006595#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006596 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597#endif
6598
6599#if 0
6600 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006601 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602#endif
6603
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006604 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605 {NULL, NULL}
6606};
6607
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006608static PyObject *
6609unicode_mod(PyObject *v, PyObject *w)
6610{
6611 if (!PyUnicode_Check(v)) {
6612 Py_INCREF(Py_NotImplemented);
6613 return Py_NotImplemented;
6614 }
6615 return PyUnicode_Format(v, w);
6616}
6617
6618static PyNumberMethods unicode_as_number = {
6619 0, /*nb_add*/
6620 0, /*nb_subtract*/
6621 0, /*nb_multiply*/
6622 0, /*nb_divide*/
6623 unicode_mod, /*nb_remainder*/
6624};
6625
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006627 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00006628 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006629 (ssizeargfunc) unicode_repeat, /* sq_repeat */
6630 (ssizeargfunc) unicode_getitem, /* sq_item */
6631 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632 0, /* sq_ass_item */
6633 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00006634 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635};
6636
Guido van Rossum38fff8c2006-03-07 18:50:55 +00006637#define HASINDEX(o) PyType_HasFeature((o)->ob_type, Py_TPFLAGS_HAVE_INDEX)
6638
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006639static PyObject*
6640unicode_subscript(PyUnicodeObject* self, PyObject* item)
6641{
Guido van Rossum38fff8c2006-03-07 18:50:55 +00006642 PyNumberMethods *nb = item->ob_type->tp_as_number;
6643 if (nb != NULL && HASINDEX(item) && nb->nb_index != NULL) {
6644 Py_ssize_t i = nb->nb_index(item);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006645 if (i == -1 && PyErr_Occurred())
6646 return NULL;
6647 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006648 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006649 return unicode_getitem(self, i);
6650 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006651 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006652 Py_UNICODE* source_buf;
6653 Py_UNICODE* result_buf;
6654 PyObject* result;
6655
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006656 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006657 &start, &stop, &step, &slicelength) < 0) {
6658 return NULL;
6659 }
6660
6661 if (slicelength <= 0) {
6662 return PyUnicode_FromUnicode(NULL, 0);
6663 } else {
6664 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Anthony Baxtera6286212006-04-11 07:42:36 +00006665 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
6666 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006667
6668 if (result_buf == NULL)
6669 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006670
6671 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6672 result_buf[i] = source_buf[cur];
6673 }
Tim Petersced69f82003-09-16 20:30:58 +00006674
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006675 result = PyUnicode_FromUnicode(result_buf, slicelength);
6676 PyMem_FREE(result_buf);
6677 return result;
6678 }
6679 } else {
6680 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6681 return NULL;
6682 }
6683}
6684
6685static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006686 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006687 (binaryfunc)unicode_subscript, /* mp_subscript */
6688 (objobjargproc)0, /* mp_ass_subscript */
6689};
6690
Martin v. Löwis18e16552006-02-15 17:27:45 +00006691static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006693 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694 const void **ptr)
6695{
6696 if (index != 0) {
6697 PyErr_SetString(PyExc_SystemError,
6698 "accessing non-existent unicode segment");
6699 return -1;
6700 }
6701 *ptr = (void *) self->str;
6702 return PyUnicode_GET_DATA_SIZE(self);
6703}
6704
Martin v. Löwis18e16552006-02-15 17:27:45 +00006705static Py_ssize_t
6706unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006707 const void **ptr)
6708{
6709 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006710 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711 return -1;
6712}
6713
6714static int
6715unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006716 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717{
6718 if (lenp)
6719 *lenp = PyUnicode_GET_DATA_SIZE(self);
6720 return 1;
6721}
6722
Martin v. Löwiseb079f12006-02-16 14:32:27 +00006723static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006725 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726 const void **ptr)
6727{
6728 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006729
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730 if (index != 0) {
6731 PyErr_SetString(PyExc_SystemError,
6732 "accessing non-existent unicode segment");
6733 return -1;
6734 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006735 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736 if (str == NULL)
6737 return -1;
6738 *ptr = (void *) PyString_AS_STRING(str);
6739 return PyString_GET_SIZE(str);
6740}
6741
6742/* Helpers for PyUnicode_Format() */
6743
6744static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006745getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006747 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748 if (argidx < arglen) {
6749 (*p_argidx)++;
6750 if (arglen < 0)
6751 return args;
6752 else
6753 return PyTuple_GetItem(args, argidx);
6754 }
6755 PyErr_SetString(PyExc_TypeError,
6756 "not enough arguments for format string");
6757 return NULL;
6758}
6759
/* Single-bit formatting flags, combined into one `flags` int.
   F_ALT is the flag formatfloat() checks to add '#' to its format. */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
6765
Martin v. Löwis18e16552006-02-15 17:27:45 +00006766static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00006767strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006769 register Py_ssize_t i;
6770 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771 for (i = len - 1; i >= 0; i--)
6772 buffer[i] = (Py_UNICODE) charbuffer[i];
6773
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774 return len;
6775}
6776
Neal Norwitzfc76d632006-01-10 06:03:13 +00006777static int
6778doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
6779{
Tim Peters15231542006-02-16 01:08:01 +00006780 Py_ssize_t result;
6781
Neal Norwitzfc76d632006-01-10 06:03:13 +00006782 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006783 result = strtounicode(buffer, (char *)buffer);
6784 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006785}
6786
6787static int
6788longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
6789{
Tim Peters15231542006-02-16 01:08:01 +00006790 Py_ssize_t result;
6791
Neal Norwitzfc76d632006-01-10 06:03:13 +00006792 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006793 result = strtounicode(buffer, (char *)buffer);
6794 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006795}
6796
Guido van Rossum078151d2002-08-11 04:24:12 +00006797/* XXX To save some code duplication, formatfloat/long/int could have been
6798 shared with stringobject.c, converting from 8-bit to Unicode after the
6799 formatting is done. */
6800
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801static int
6802formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006803 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804 int flags,
6805 int prec,
6806 int type,
6807 PyObject *v)
6808{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006809 /* fmt = '%#.' + `prec` + `type`
6810 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811 char fmt[20];
6812 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006813
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814 x = PyFloat_AsDouble(v);
6815 if (x == -1.0 && PyErr_Occurred())
6816 return -1;
6817 if (prec < 0)
6818 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6820 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006821 /* Worst case length calc to ensure no buffer overrun:
6822
6823 'g' formats:
6824 fmt = %#.<prec>g
6825 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6826 for any double rep.)
6827 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6828
6829 'f' formats:
6830 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6831 len = 1 + 50 + 1 + prec = 52 + prec
6832
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006833 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006834 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006835
6836 */
6837 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6838 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006839 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006840 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006841 return -1;
6842 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006843 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6844 (flags&F_ALT) ? "#" : "",
6845 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006846 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847}
6848
Tim Peters38fd5b62000-09-21 05:43:11 +00006849static PyObject*
6850formatlong(PyObject *val, int flags, int prec, int type)
6851{
6852 char *buf;
6853 int i, len;
6854 PyObject *str; /* temporary string object. */
6855 PyUnicodeObject *result;
6856
6857 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6858 if (!str)
6859 return NULL;
6860 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006861 if (!result) {
6862 Py_DECREF(str);
6863 return NULL;
6864 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006865 for (i = 0; i < len; i++)
6866 result->str[i] = buf[i];
6867 result->str[len] = 0;
6868 Py_DECREF(str);
6869 return (PyObject*)result;
6870}
6871
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872static int
6873formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006874 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875 int flags,
6876 int prec,
6877 int type,
6878 PyObject *v)
6879{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006880 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006881 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6882 * + 1 + 1
6883 * = 24
6884 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006885 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006886 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887 long x;
6888
6889 x = PyInt_AsLong(v);
6890 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006891 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006892 if (x < 0 && type == 'u') {
6893 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006894 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006895 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6896 sign = "-";
6897 else
6898 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006900 prec = 1;
6901
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006902 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6903 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006904 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006905 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006906 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006907 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006908 return -1;
6909 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006910
6911 if ((flags & F_ALT) &&
6912 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006913 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006914 * of issues that cause pain:
6915 * - when 0 is being converted, the C standard leaves off
6916 * the '0x' or '0X', which is inconsistent with other
6917 * %#x/%#X conversions and inconsistent with Python's
6918 * hex() function
6919 * - there are platforms that violate the standard and
6920 * convert 0 with the '0x' or '0X'
6921 * (Metrowerks, Compaq Tru64)
6922 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006923 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006924 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006925 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006926 * We can achieve the desired consistency by inserting our
6927 * own '0x' or '0X' prefix, and substituting %x/%X in place
6928 * of %#x/%#X.
6929 *
6930 * Note that this is the same approach as used in
6931 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006932 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006933 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6934 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006935 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006936 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006937 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6938 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006939 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006940 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006941 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00006942 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006943 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00006944 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945}
6946
6947static int
6948formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006949 size_t buflen,
6950 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006952 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006953 if (PyUnicode_Check(v)) {
6954 if (PyUnicode_GET_SIZE(v) != 1)
6955 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006957 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006959 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006960 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006961 goto onError;
6962 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6963 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964
6965 else {
6966 /* Integer input truncated to a character */
6967 long x;
6968 x = PyInt_AsLong(v);
6969 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006970 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006971#ifdef Py_UNICODE_WIDE
6972 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006973 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006974 "%c arg not in range(0x110000) "
6975 "(wide Python build)");
6976 return -1;
6977 }
6978#else
6979 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006980 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006981 "%c arg not in range(0x10000) "
6982 "(narrow Python build)");
6983 return -1;
6984 }
6985#endif
6986 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987 }
6988 buf[1] = '\0';
6989 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006990
6991 onError:
6992 PyErr_SetString(PyExc_TypeError,
6993 "%c requires int or char");
6994 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995}
6996
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the cast cannot combine
   with a neighbouring operator at the point of use (for example
   `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)120)
7006
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007PyObject *PyUnicode_Format(PyObject *format,
7008 PyObject *args)
7009{
7010 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007011 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012 int args_owned = 0;
7013 PyUnicodeObject *result = NULL;
7014 PyObject *dict = NULL;
7015 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007016
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017 if (format == NULL || args == NULL) {
7018 PyErr_BadInternalCall();
7019 return NULL;
7020 }
7021 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007022 if (uformat == NULL)
7023 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024 fmt = PyUnicode_AS_UNICODE(uformat);
7025 fmtcnt = PyUnicode_GET_SIZE(uformat);
7026
7027 reslen = rescnt = fmtcnt + 100;
7028 result = _PyUnicode_New(reslen);
7029 if (result == NULL)
7030 goto onError;
7031 res = PyUnicode_AS_UNICODE(result);
7032
7033 if (PyTuple_Check(args)) {
7034 arglen = PyTuple_Size(args);
7035 argidx = 0;
7036 }
7037 else {
7038 arglen = -1;
7039 argidx = -2;
7040 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007041 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7042 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043 dict = args;
7044
7045 while (--fmtcnt >= 0) {
7046 if (*fmt != '%') {
7047 if (--rescnt < 0) {
7048 rescnt = fmtcnt + 100;
7049 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007050 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007051 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007052 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7053 --rescnt;
7054 }
7055 *res++ = *fmt++;
7056 }
7057 else {
7058 /* Got a format specifier */
7059 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007060 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062 Py_UNICODE c = '\0';
7063 Py_UNICODE fill;
7064 PyObject *v = NULL;
7065 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007066 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007068 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007069 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007070
7071 fmt++;
7072 if (*fmt == '(') {
7073 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007074 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007075 PyObject *key;
7076 int pcount = 1;
7077
7078 if (dict == NULL) {
7079 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007080 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081 goto onError;
7082 }
7083 ++fmt;
7084 --fmtcnt;
7085 keystart = fmt;
7086 /* Skip over balanced parentheses */
7087 while (pcount > 0 && --fmtcnt >= 0) {
7088 if (*fmt == ')')
7089 --pcount;
7090 else if (*fmt == '(')
7091 ++pcount;
7092 fmt++;
7093 }
7094 keylen = fmt - keystart - 1;
7095 if (fmtcnt < 0 || pcount > 0) {
7096 PyErr_SetString(PyExc_ValueError,
7097 "incomplete format key");
7098 goto onError;
7099 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007100#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007101 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102 then looked up since Python uses strings to hold
7103 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007104 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007105 key = PyUnicode_EncodeUTF8(keystart,
7106 keylen,
7107 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007108#else
7109 key = PyUnicode_FromUnicode(keystart, keylen);
7110#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007111 if (key == NULL)
7112 goto onError;
7113 if (args_owned) {
7114 Py_DECREF(args);
7115 args_owned = 0;
7116 }
7117 args = PyObject_GetItem(dict, key);
7118 Py_DECREF(key);
7119 if (args == NULL) {
7120 goto onError;
7121 }
7122 args_owned = 1;
7123 arglen = -1;
7124 argidx = -2;
7125 }
7126 while (--fmtcnt >= 0) {
7127 switch (c = *fmt++) {
7128 case '-': flags |= F_LJUST; continue;
7129 case '+': flags |= F_SIGN; continue;
7130 case ' ': flags |= F_BLANK; continue;
7131 case '#': flags |= F_ALT; continue;
7132 case '0': flags |= F_ZERO; continue;
7133 }
7134 break;
7135 }
7136 if (c == '*') {
7137 v = getnextarg(args, arglen, &argidx);
7138 if (v == NULL)
7139 goto onError;
7140 if (!PyInt_Check(v)) {
7141 PyErr_SetString(PyExc_TypeError,
7142 "* wants int");
7143 goto onError;
7144 }
7145 width = PyInt_AsLong(v);
7146 if (width < 0) {
7147 flags |= F_LJUST;
7148 width = -width;
7149 }
7150 if (--fmtcnt >= 0)
7151 c = *fmt++;
7152 }
7153 else if (c >= '0' && c <= '9') {
7154 width = c - '0';
7155 while (--fmtcnt >= 0) {
7156 c = *fmt++;
7157 if (c < '0' || c > '9')
7158 break;
7159 if ((width*10) / 10 != width) {
7160 PyErr_SetString(PyExc_ValueError,
7161 "width too big");
7162 goto onError;
7163 }
7164 width = width*10 + (c - '0');
7165 }
7166 }
7167 if (c == '.') {
7168 prec = 0;
7169 if (--fmtcnt >= 0)
7170 c = *fmt++;
7171 if (c == '*') {
7172 v = getnextarg(args, arglen, &argidx);
7173 if (v == NULL)
7174 goto onError;
7175 if (!PyInt_Check(v)) {
7176 PyErr_SetString(PyExc_TypeError,
7177 "* wants int");
7178 goto onError;
7179 }
7180 prec = PyInt_AsLong(v);
7181 if (prec < 0)
7182 prec = 0;
7183 if (--fmtcnt >= 0)
7184 c = *fmt++;
7185 }
7186 else if (c >= '0' && c <= '9') {
7187 prec = c - '0';
7188 while (--fmtcnt >= 0) {
7189 c = Py_CHARMASK(*fmt++);
7190 if (c < '0' || c > '9')
7191 break;
7192 if ((prec*10) / 10 != prec) {
7193 PyErr_SetString(PyExc_ValueError,
7194 "prec too big");
7195 goto onError;
7196 }
7197 prec = prec*10 + (c - '0');
7198 }
7199 }
7200 } /* prec */
7201 if (fmtcnt >= 0) {
7202 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007203 if (--fmtcnt >= 0)
7204 c = *fmt++;
7205 }
7206 }
7207 if (fmtcnt < 0) {
7208 PyErr_SetString(PyExc_ValueError,
7209 "incomplete format");
7210 goto onError;
7211 }
7212 if (c != '%') {
7213 v = getnextarg(args, arglen, &argidx);
7214 if (v == NULL)
7215 goto onError;
7216 }
7217 sign = 0;
7218 fill = ' ';
7219 switch (c) {
7220
7221 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007222 pbuf = formatbuf;
7223 /* presume that buffer length is at least 1 */
7224 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007225 len = 1;
7226 break;
7227
7228 case 's':
7229 case 'r':
7230 if (PyUnicode_Check(v) && c == 's') {
7231 temp = v;
7232 Py_INCREF(temp);
7233 }
7234 else {
7235 PyObject *unicode;
7236 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007237 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007238 else
7239 temp = PyObject_Repr(v);
7240 if (temp == NULL)
7241 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007242 if (PyUnicode_Check(temp))
7243 /* nothing to do */;
7244 else if (PyString_Check(temp)) {
7245 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007246 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007247 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007248 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007249 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007250 Py_DECREF(temp);
7251 temp = unicode;
7252 if (temp == NULL)
7253 goto onError;
7254 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007255 else {
7256 Py_DECREF(temp);
7257 PyErr_SetString(PyExc_TypeError,
7258 "%s argument has non-string str()");
7259 goto onError;
7260 }
7261 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007262 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263 len = PyUnicode_GET_SIZE(temp);
7264 if (prec >= 0 && len > prec)
7265 len = prec;
7266 break;
7267
7268 case 'i':
7269 case 'd':
7270 case 'u':
7271 case 'o':
7272 case 'x':
7273 case 'X':
7274 if (c == 'i')
7275 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007276 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007277 temp = formatlong(v, flags, prec, c);
7278 if (!temp)
7279 goto onError;
7280 pbuf = PyUnicode_AS_UNICODE(temp);
7281 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007282 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007284 else {
7285 pbuf = formatbuf;
7286 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7287 flags, prec, c, v);
7288 if (len < 0)
7289 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007290 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007291 }
7292 if (flags & F_ZERO)
7293 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294 break;
7295
7296 case 'e':
7297 case 'E':
7298 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007299 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007300 case 'g':
7301 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007302 if (c == 'F')
7303 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007304 pbuf = formatbuf;
7305 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7306 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007307 if (len < 0)
7308 goto onError;
7309 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007310 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007311 fill = '0';
7312 break;
7313
7314 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007315 pbuf = formatbuf;
7316 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317 if (len < 0)
7318 goto onError;
7319 break;
7320
7321 default:
7322 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007323 "unsupported format character '%c' (0x%x) "
7324 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00007325 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007326 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00007327 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007328 goto onError;
7329 }
7330 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007331 if (*pbuf == '-' || *pbuf == '+') {
7332 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007333 len--;
7334 }
7335 else if (flags & F_SIGN)
7336 sign = '+';
7337 else if (flags & F_BLANK)
7338 sign = ' ';
7339 else
7340 sign = 0;
7341 }
7342 if (width < len)
7343 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007344 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007345 reslen -= rescnt;
7346 rescnt = width + fmtcnt + 100;
7347 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007348 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007349 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007350 PyErr_NoMemory();
7351 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007352 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007353 if (_PyUnicode_Resize(&result, reslen) < 0) {
7354 Py_XDECREF(temp);
7355 goto onError;
7356 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007357 res = PyUnicode_AS_UNICODE(result)
7358 + reslen - rescnt;
7359 }
7360 if (sign) {
7361 if (fill != ' ')
7362 *res++ = sign;
7363 rescnt--;
7364 if (width > len)
7365 width--;
7366 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007367 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7368 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007369 assert(pbuf[1] == c);
7370 if (fill != ' ') {
7371 *res++ = *pbuf++;
7372 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007373 }
Tim Petersfff53252001-04-12 18:38:48 +00007374 rescnt -= 2;
7375 width -= 2;
7376 if (width < 0)
7377 width = 0;
7378 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007379 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007380 if (width > len && !(flags & F_LJUST)) {
7381 do {
7382 --rescnt;
7383 *res++ = fill;
7384 } while (--width > len);
7385 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007386 if (fill == ' ') {
7387 if (sign)
7388 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007389 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007390 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007391 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007392 *res++ = *pbuf++;
7393 *res++ = *pbuf++;
7394 }
7395 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007396 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007397 res += len;
7398 rescnt -= len;
7399 while (--width >= len) {
7400 --rescnt;
7401 *res++ = ' ';
7402 }
7403 if (dict && (argidx < arglen) && c != '%') {
7404 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007405 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007406 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007407 goto onError;
7408 }
7409 Py_XDECREF(temp);
7410 } /* '%' */
7411 } /* until end */
7412 if (argidx < arglen && !dict) {
7413 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007414 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007415 goto onError;
7416 }
7417
Thomas Woutersa96affe2006-03-12 00:29:36 +00007418 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7419 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420 if (args_owned) {
7421 Py_DECREF(args);
7422 }
7423 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007424 return (PyObject *)result;
7425
7426 onError:
7427 Py_XDECREF(result);
7428 Py_DECREF(uformat);
7429 if (args_owned) {
7430 Py_DECREF(args);
7431 }
7432 return NULL;
7433}
7434
7435static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007436 (readbufferproc) unicode_buffer_getreadbuf,
7437 (writebufferproc) unicode_buffer_getwritebuf,
7438 (segcountproc) unicode_buffer_getsegcount,
7439 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440};
7441
Jeremy Hylton938ace62002-07-17 16:30:39 +00007442static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007443unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7444
Tim Peters6d6c1a32001-08-02 04:15:00 +00007445static PyObject *
7446unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7447{
7448 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007449 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007450 char *encoding = NULL;
7451 char *errors = NULL;
7452
Guido van Rossume023fe02001-08-30 03:12:59 +00007453 if (type != &PyUnicode_Type)
7454 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007455 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7456 kwlist, &x, &encoding, &errors))
7457 return NULL;
7458 if (x == NULL)
7459 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007460 if (encoding == NULL && errors == NULL)
7461 return PyObject_Unicode(x);
7462 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007463 return PyUnicode_FromEncodedObject(x, encoding, errors);
7464}
7465
Guido van Rossume023fe02001-08-30 03:12:59 +00007466static PyObject *
7467unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7468{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007469 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007470 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007471
7472 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7473 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7474 if (tmp == NULL)
7475 return NULL;
7476 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007477 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007478 if (pnew == NULL) {
7479 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007480 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007481 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007482 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7483 if (pnew->str == NULL) {
7484 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007485 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007486 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007487 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007488 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007489 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7490 pnew->length = n;
7491 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007492 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007493 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007494}
7495
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007496PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007497"unicode(string [, encoding[, errors]]) -> object\n\
7498\n\
7499Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007500encoding defaults to the current default string encoding.\n\
7501errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007502
Guido van Rossumd57fd912000-03-10 22:53:23 +00007503PyTypeObject PyUnicode_Type = {
7504 PyObject_HEAD_INIT(&PyType_Type)
7505 0, /* ob_size */
7506 "unicode", /* tp_name */
7507 sizeof(PyUnicodeObject), /* tp_size */
7508 0, /* tp_itemsize */
7509 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007510 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007511 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007512 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007513 0, /* tp_setattr */
7514 (cmpfunc) unicode_compare, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00007515 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007516 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007517 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007518 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007519 (hashfunc) unicode_hash, /* tp_hash*/
7520 0, /* tp_call*/
7521 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007522 PyObject_GenericGetAttr, /* tp_getattro */
7523 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007524 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007525 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7526 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007527 unicode_doc, /* tp_doc */
7528 0, /* tp_traverse */
7529 0, /* tp_clear */
7530 0, /* tp_richcompare */
7531 0, /* tp_weaklistoffset */
7532 0, /* tp_iter */
7533 0, /* tp_iternext */
7534 unicode_methods, /* tp_methods */
7535 0, /* tp_members */
7536 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007537 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007538 0, /* tp_dict */
7539 0, /* tp_descr_get */
7540 0, /* tp_descr_set */
7541 0, /* tp_dictoffset */
7542 0, /* tp_init */
7543 0, /* tp_alloc */
7544 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007545 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546};
7547
7548/* Initialize the Unicode implementation */
7549
Thomas Wouters78890102000-07-22 19:25:51 +00007550void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007552 int i;
7553
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007554 /* XXX - move this array to unicodectype.c ? */
7555 Py_UNICODE linebreak[] = {
7556 0x000A, /* LINE FEED */
7557 0x000D, /* CARRIAGE RETURN */
7558 0x001C, /* FILE SEPARATOR */
7559 0x001D, /* GROUP SEPARATOR */
7560 0x001E, /* RECORD SEPARATOR */
7561 0x0085, /* NEXT LINE */
7562 0x2028, /* LINE SEPARATOR */
7563 0x2029, /* PARAGRAPH SEPARATOR */
7564 };
7565
Fred Drakee4315f52000-05-09 19:53:39 +00007566 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007567 unicode_freelist = NULL;
7568 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007570 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007571 for (i = 0; i < 256; i++)
7572 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007573 if (PyType_Ready(&PyUnicode_Type) < 0)
7574 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007575
7576 /* initialize the linebreak bloom filter */
7577 bloom_linebreak = make_bloom_mask(
7578 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
7579 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580}
7581
7582/* Finalize the Unicode implementation */
7583
7584void
Thomas Wouters78890102000-07-22 19:25:51 +00007585_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007587 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007588 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007589
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007590 Py_XDECREF(unicode_empty);
7591 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007592
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007593 for (i = 0; i < 256; i++) {
7594 if (unicode_latin1[i]) {
7595 Py_DECREF(unicode_latin1[i]);
7596 unicode_latin1[i] = NULL;
7597 }
7598 }
7599
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007600 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007601 PyUnicodeObject *v = u;
7602 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007603 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007604 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007605 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007606 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007608 unicode_freelist = NULL;
7609 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007610}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007611
Anthony Baxterac6bd462006-04-13 02:06:09 +00007612#ifdef __cplusplus
7613}
7614#endif
7615
7616
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007617/*
7618Local variables:
7619c-basic-offset: 4
7620indent-tabs-mode: nil
7621End:
7622*/