blob: 485e3607d1890ebd3bc480fce27d91bafece6018 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
Martin v. Löwis5cb69362006-04-14 09:08:42 +000039#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000040#include "Python.h"
41
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* USE_INLINE is force-disabled here, so the plain "static" spelling of
   LOCAL() is what non-MSVC builds get. */
#undef USE_INLINE /* XXX - set via configure? */

/* LOCAL(type) is the declaration prefix used for file-local helper
   functions.  The MSVC variant is taken from _sre.c. */
#if defined(_MSC_VER)
#pragma warning(disable: 4710)
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#elif !defined(USE_INLINE)
#define LOCAL(type) static type
#else
#define LOCAL(type) static inline type
#endif
60
/* Upper bound on the number of Unicode objects kept on the free list */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Size limit for the free list "stay alive" optimization.

   Objects placed on the free list keep their character buffer
   allocated as long as their size is below this limit.  This saves
   malloc() calls for small Unicode objects.

   The worst case cost is MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   A limit of 0 effectively switches the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian unless WORDS_BIGENDIAN is defined */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
91
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000092/* --- Globals ------------------------------------------------------------
93
94 The globals are initialized by the _PyUnicode_Init() API and should
95 not be used before calling that API.
96
97*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Anthony Baxterac6bd462006-04-13 02:06:09 +000099
100#ifdef __cplusplus
101extern "C" {
102#endif
103
Guido van Rossumd57fd912000-03-10 22:53:23 +0000104/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000105static PyUnicodeObject *unicode_freelist;
106static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000107
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000108/* The empty Unicode object is shared to improve performance. */
109static PyUnicodeObject *unicode_empty;
110
111/* Single character Unicode strings in the Latin-1 range are being
112 shared as well. */
113static PyUnicodeObject *unicode_latin1[256];
114
Fred Drakee4315f52000-05-09 19:53:39 +0000115/* Default encoding to use and assume when NULL is passed as encoding
116 parameter; it is initialized by _PyUnicode_Init().
117
118 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000119 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000120
121*/
Fred Drakee4315f52000-05-09 19:53:39 +0000122static char unicode_default_encoding[100];
123
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000124Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000125PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000126{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000127#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000128 return 0x10FFFF;
129#else
130 /* This is actually an illegal character, so it should
131 not be passed to unichr. */
132 return 0xFFFF;
133#endif
134}
135
/* --- Bloom Filters ----------------------------------------------------- */

/* Simple "bloom filters" for Unicode characters.  To keep things
   simple, a single bitmask is used, with the least significant 5 bits
   of each Unicode character serving as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Nonzero if the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is unsigned long: bit 31 can be selected, and
   shifting a plain int 1 that far would overflow a signed int. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap pre-filter on the bloom mask, followed by the exact test. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
152
153LOCAL(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
154{
155 /* calculate simple bloom-style bitmask for a given unicode string */
156
157 long mask;
158 Py_ssize_t i;
159
160 mask = 0;
161 for (i = 0; i < len; i++)
162 mask |= (1 << (ptr[i] & 0x1F));
163
164 return mask;
165}
166
167LOCAL(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
168{
169 Py_ssize_t i;
170
171 for (i = 0; i < setlen; i++)
172 if (set[i] == chr)
173 return 1;
174
Fredrik Lundh77633512006-05-23 19:47:35 +0000175 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000176}
177
/* Bloom pre-filter followed by the exact membership test.  The whole
   expansion is parenthesized so that the macro can be negated or
   combined with other operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
180
Guido van Rossumd57fd912000-03-10 22:53:23 +0000181/* --- Unicode Object ----------------------------------------------------- */
182
183static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000185 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000186{
187 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000188
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000189 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000191 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000192
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 /* Resizing shared object (unicode_empty or single character
194 objects) in-place is not allowed. Use PyUnicode_Resize()
195 instead ! */
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000196 if (unicode == unicode_empty ||
197 (unicode->length == 1 &&
198 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 return -1;
203 }
204
205 /* We allocate one more byte to make sure the string is
206 Ux0000 terminated -- XXX is this needed ? */
207 oldstr = unicode->str;
208 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
209 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000210 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211 PyErr_NoMemory();
212 return -1;
213 }
214 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000215 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000216
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000217 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000219 if (unicode->defenc) {
220 Py_DECREF(unicode->defenc);
221 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000222 }
223 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000224
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225 return 0;
226}
227
228/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000229 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000230
231 XXX This allocator could further be enhanced by assuring that the
232 free list never reduces its size below 1.
233
234*/
235
236static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000237PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000238{
239 register PyUnicodeObject *unicode;
240
Tim Petersced69f82003-09-16 20:30:58 +0000241 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 if (length == 0 && unicode_empty != NULL) {
243 Py_INCREF(unicode_empty);
244 return unicode_empty;
245 }
246
247 /* Unicode freelist & memory allocation */
248 if (unicode_freelist) {
249 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000250 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 /* Keep-Alive optimization: we only upsize the buffer,
254 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000255 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000256 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000257 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000258 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 }
260 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000261 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000263 }
264 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000265 }
266 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000267 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268 if (unicode == NULL)
269 return NULL;
270 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
271 }
272
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000273 if (!unicode->str) {
274 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000275 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000276 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000277 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000278 * the caller fails before initializing str -- unicode_resize()
279 * reads str[0], and the Keep-Alive optimization can keep memory
280 * allocated for str alive across a call to unicode_dealloc(unicode).
281 * We don't want unicode_resize to read uninitialized memory in
282 * that case.
283 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000284 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000285 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000286 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000288 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000290
291 onError:
292 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000293 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000294 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000295}
296
297static
Guido van Rossum9475a232001-10-05 20:51:39 +0000298void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000300 if (PyUnicode_CheckExact(unicode) &&
301 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000302 /* Keep-Alive optimization */
303 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000304 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305 unicode->str = NULL;
306 unicode->length = 0;
307 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000308 if (unicode->defenc) {
309 Py_DECREF(unicode->defenc);
310 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000311 }
312 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000313 *(PyUnicodeObject **)unicode = unicode_freelist;
314 unicode_freelist = unicode;
315 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316 }
317 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000318 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000319 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000320 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000321 }
322}
323
Martin v. Löwis18e16552006-02-15 17:27:45 +0000324int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000325{
326 register PyUnicodeObject *v;
327
328 /* Argument checks */
329 if (unicode == NULL) {
330 PyErr_BadInternalCall();
331 return -1;
332 }
333 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000334 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000335 PyErr_BadInternalCall();
336 return -1;
337 }
338
339 /* Resizing unicode_empty and single character objects is not
340 possible since these are being shared. We simply return a fresh
341 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000342 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000343 (v == unicode_empty || v->length == 1)) {
344 PyUnicodeObject *w = _PyUnicode_New(length);
345 if (w == NULL)
346 return -1;
347 Py_UNICODE_COPY(w->str, v->str,
348 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000349 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000350 *unicode = (PyObject *)w;
351 return 0;
352 }
353
354 /* Note that we don't have to modify *unicode for unshared Unicode
355 objects, since we can modify them in-place. */
356 return unicode_resize(v, length);
357}
358
359/* Internal API for use in unicodeobject.c only ! */
360#define _PyUnicode_Resize(unicodevar, length) \
361 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
362
Guido van Rossumd57fd912000-03-10 22:53:23 +0000363PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000364 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365{
366 PyUnicodeObject *unicode;
367
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000368 /* If the Unicode data is known at construction time, we can apply
369 some optimizations which share commonly used objects. */
370 if (u != NULL) {
371
372 /* Optimization for empty strings */
373 if (size == 0 && unicode_empty != NULL) {
374 Py_INCREF(unicode_empty);
375 return (PyObject *)unicode_empty;
376 }
377
378 /* Single character Unicode objects in the Latin-1 range are
379 shared when using this constructor */
380 if (size == 1 && *u < 256) {
381 unicode = unicode_latin1[*u];
382 if (!unicode) {
383 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000384 if (!unicode)
385 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000386 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000387 unicode_latin1[*u] = unicode;
388 }
389 Py_INCREF(unicode);
390 return (PyObject *)unicode;
391 }
392 }
Tim Petersced69f82003-09-16 20:30:58 +0000393
Guido van Rossumd57fd912000-03-10 22:53:23 +0000394 unicode = _PyUnicode_New(size);
395 if (!unicode)
396 return NULL;
397
398 /* Copy the Unicode data into the new object */
399 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000400 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401
402 return (PyObject *)unicode;
403}
404
405#ifdef HAVE_WCHAR_H
406
407PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000408 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000409{
410 PyUnicodeObject *unicode;
411
412 if (w == NULL) {
413 PyErr_BadInternalCall();
414 return NULL;
415 }
416
417 unicode = _PyUnicode_New(size);
418 if (!unicode)
419 return NULL;
420
421 /* Copy the wchar_t data into the new object */
422#ifdef HAVE_USABLE_WCHAR_T
423 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000424#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425 {
426 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000427 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000428 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000429 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430 *u++ = *w++;
431 }
432#endif
433
434 return (PyObject *)unicode;
435}
436
Martin v. Löwis18e16552006-02-15 17:27:45 +0000437Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
438 wchar_t *w,
439 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000440{
441 if (unicode == NULL) {
442 PyErr_BadInternalCall();
443 return -1;
444 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000445
446 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000447 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000448 size = PyUnicode_GET_SIZE(unicode) + 1;
449
Guido van Rossumd57fd912000-03-10 22:53:23 +0000450#ifdef HAVE_USABLE_WCHAR_T
451 memcpy(w, unicode->str, size * sizeof(wchar_t));
452#else
453 {
454 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000455 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000456 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000457 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000458 *w++ = *u++;
459 }
460#endif
461
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000462 if (size > PyUnicode_GET_SIZE(unicode))
463 return PyUnicode_GET_SIZE(unicode);
464 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000465 return size;
466}
467
468#endif
469
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000470PyObject *PyUnicode_FromOrdinal(int ordinal)
471{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000472 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000473
474#ifdef Py_UNICODE_WIDE
475 if (ordinal < 0 || ordinal > 0x10ffff) {
476 PyErr_SetString(PyExc_ValueError,
477 "unichr() arg not in range(0x110000) "
478 "(wide Python build)");
479 return NULL;
480 }
481#else
482 if (ordinal < 0 || ordinal > 0xffff) {
483 PyErr_SetString(PyExc_ValueError,
484 "unichr() arg not in range(0x10000) "
485 "(narrow Python build)");
486 return NULL;
487 }
488#endif
489
Hye-Shik Chang40574832004-04-06 07:24:51 +0000490 s[0] = (Py_UNICODE)ordinal;
491 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000492}
493
Guido van Rossumd57fd912000-03-10 22:53:23 +0000494PyObject *PyUnicode_FromObject(register PyObject *obj)
495{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000496 /* XXX Perhaps we should make this API an alias of
497 PyObject_Unicode() instead ?! */
498 if (PyUnicode_CheckExact(obj)) {
499 Py_INCREF(obj);
500 return obj;
501 }
502 if (PyUnicode_Check(obj)) {
503 /* For a Unicode subtype that's not a Unicode object,
504 return a true Unicode object with the same data. */
505 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
506 PyUnicode_GET_SIZE(obj));
507 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
509}
510
511PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
512 const char *encoding,
513 const char *errors)
514{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000515 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000516 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000517 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000518
Guido van Rossumd57fd912000-03-10 22:53:23 +0000519 if (obj == NULL) {
520 PyErr_BadInternalCall();
521 return NULL;
522 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000523
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000524#if 0
525 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000526 that no encodings is given and then redirect to
527 PyObject_Unicode() which then applies the additional logic for
528 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000529
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000530 NOTE: This API should really only be used for object which
531 represent *encoded* Unicode !
532
533 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000534 if (PyUnicode_Check(obj)) {
535 if (encoding) {
536 PyErr_SetString(PyExc_TypeError,
537 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000538 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000539 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000540 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000541 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000542#else
543 if (PyUnicode_Check(obj)) {
544 PyErr_SetString(PyExc_TypeError,
545 "decoding Unicode is not supported");
546 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000547 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000548#endif
549
550 /* Coerce object */
551 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000552 s = PyString_AS_STRING(obj);
553 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000554 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000555 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
556 /* Overwrite the error message with something more useful in
557 case of a TypeError. */
558 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000559 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000560 "coercing to Unicode: need string or buffer, "
561 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000562 obj->ob_type->tp_name);
563 goto onError;
564 }
Tim Petersced69f82003-09-16 20:30:58 +0000565
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000566 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000567 if (len == 0) {
568 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000569 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000570 }
Tim Petersced69f82003-09-16 20:30:58 +0000571 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000572 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000573
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000574 return v;
575
576 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000577 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000578}
579
580PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000581 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000582 const char *encoding,
583 const char *errors)
584{
585 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000586
587 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000588 encoding = PyUnicode_GetDefaultEncoding();
589
590 /* Shortcuts for common default encodings */
591 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000592 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000593 else if (strcmp(encoding, "latin-1") == 0)
594 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000595#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
596 else if (strcmp(encoding, "mbcs") == 0)
597 return PyUnicode_DecodeMBCS(s, size, errors);
598#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000599 else if (strcmp(encoding, "ascii") == 0)
600 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000601
602 /* Decode via the codec registry */
603 buffer = PyBuffer_FromMemory((void *)s, size);
604 if (buffer == NULL)
605 goto onError;
606 unicode = PyCodec_Decode(buffer, encoding, errors);
607 if (unicode == NULL)
608 goto onError;
609 if (!PyUnicode_Check(unicode)) {
610 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000611 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000612 unicode->ob_type->tp_name);
613 Py_DECREF(unicode);
614 goto onError;
615 }
616 Py_DECREF(buffer);
617 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000618
Guido van Rossumd57fd912000-03-10 22:53:23 +0000619 onError:
620 Py_XDECREF(buffer);
621 return NULL;
622}
623
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000624PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
625 const char *encoding,
626 const char *errors)
627{
628 PyObject *v;
629
630 if (!PyUnicode_Check(unicode)) {
631 PyErr_BadArgument();
632 goto onError;
633 }
634
635 if (encoding == NULL)
636 encoding = PyUnicode_GetDefaultEncoding();
637
638 /* Decode via the codec registry */
639 v = PyCodec_Decode(unicode, encoding, errors);
640 if (v == NULL)
641 goto onError;
642 return v;
643
644 onError:
645 return NULL;
646}
647
Guido van Rossumd57fd912000-03-10 22:53:23 +0000648PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000649 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 const char *encoding,
651 const char *errors)
652{
653 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000654
Guido van Rossumd57fd912000-03-10 22:53:23 +0000655 unicode = PyUnicode_FromUnicode(s, size);
656 if (unicode == NULL)
657 return NULL;
658 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
659 Py_DECREF(unicode);
660 return v;
661}
662
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000663PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
664 const char *encoding,
665 const char *errors)
666{
667 PyObject *v;
668
669 if (!PyUnicode_Check(unicode)) {
670 PyErr_BadArgument();
671 goto onError;
672 }
673
674 if (encoding == NULL)
675 encoding = PyUnicode_GetDefaultEncoding();
676
677 /* Encode via the codec registry */
678 v = PyCodec_Encode(unicode, encoding, errors);
679 if (v == NULL)
680 goto onError;
681 return v;
682
683 onError:
684 return NULL;
685}
686
Guido van Rossumd57fd912000-03-10 22:53:23 +0000687PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
688 const char *encoding,
689 const char *errors)
690{
691 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000692
Guido van Rossumd57fd912000-03-10 22:53:23 +0000693 if (!PyUnicode_Check(unicode)) {
694 PyErr_BadArgument();
695 goto onError;
696 }
Fred Drakee4315f52000-05-09 19:53:39 +0000697
Tim Petersced69f82003-09-16 20:30:58 +0000698 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000699 encoding = PyUnicode_GetDefaultEncoding();
700
701 /* Shortcuts for common default encodings */
702 if (errors == NULL) {
703 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000704 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000705 else if (strcmp(encoding, "latin-1") == 0)
706 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000707#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
708 else if (strcmp(encoding, "mbcs") == 0)
709 return PyUnicode_AsMBCSString(unicode);
710#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000711 else if (strcmp(encoding, "ascii") == 0)
712 return PyUnicode_AsASCIIString(unicode);
713 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000714
715 /* Encode via the codec registry */
716 v = PyCodec_Encode(unicode, encoding, errors);
717 if (v == NULL)
718 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000719 if (!PyString_Check(v)) {
720 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000721 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000722 v->ob_type->tp_name);
723 Py_DECREF(v);
724 goto onError;
725 }
726 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000727
Guido van Rossumd57fd912000-03-10 22:53:23 +0000728 onError:
729 return NULL;
730}
731
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000732PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
733 const char *errors)
734{
735 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
736
737 if (v)
738 return v;
739 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
740 if (v && errors == NULL)
741 ((PyUnicodeObject *)unicode)->defenc = v;
742 return v;
743}
744
Guido van Rossumd57fd912000-03-10 22:53:23 +0000745Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
746{
747 if (!PyUnicode_Check(unicode)) {
748 PyErr_BadArgument();
749 goto onError;
750 }
751 return PyUnicode_AS_UNICODE(unicode);
752
753 onError:
754 return NULL;
755}
756
Martin v. Löwis18e16552006-02-15 17:27:45 +0000757Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000758{
759 if (!PyUnicode_Check(unicode)) {
760 PyErr_BadArgument();
761 goto onError;
762 }
763 return PyUnicode_GET_SIZE(unicode);
764
765 onError:
766 return -1;
767}
768
Thomas Wouters78890102000-07-22 19:25:51 +0000769const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000770{
771 return unicode_default_encoding;
772}
773
774int PyUnicode_SetDefaultEncoding(const char *encoding)
775{
776 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000777
Fred Drakee4315f52000-05-09 19:53:39 +0000778 /* Make sure the encoding is valid. As side effect, this also
779 loads the encoding into the codec registry cache. */
780 v = _PyCodec_Lookup(encoding);
781 if (v == NULL)
782 goto onError;
783 Py_DECREF(v);
784 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000785 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000786 sizeof(unicode_default_encoding));
787 return 0;
788
789 onError:
790 return -1;
791}
792
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000793/* error handling callback helper:
794 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000795 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000796 and adjust various state variables.
797 return 0 on success, -1 on error
798*/
799
800static
801int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
802 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000803 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
804 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000805{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000806 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000807
808 PyObject *restuple = NULL;
809 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000810 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
811 Py_ssize_t requiredsize;
812 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000813 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000814 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000815 int res = -1;
816
817 if (*errorHandler == NULL) {
818 *errorHandler = PyCodec_LookupError(errors);
819 if (*errorHandler == NULL)
820 goto onError;
821 }
822
823 if (*exceptionObject == NULL) {
824 *exceptionObject = PyUnicodeDecodeError_Create(
825 encoding, input, insize, *startinpos, *endinpos, reason);
826 if (*exceptionObject == NULL)
827 goto onError;
828 }
829 else {
830 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
831 goto onError;
832 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
833 goto onError;
834 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
835 goto onError;
836 }
837
838 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
839 if (restuple == NULL)
840 goto onError;
841 if (!PyTuple_Check(restuple)) {
842 PyErr_Format(PyExc_TypeError, &argparse[4]);
843 goto onError;
844 }
845 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
846 goto onError;
847 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000848 newpos = insize+newpos;
849 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000850 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000851 goto onError;
852 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000853
854 /* need more space? (at least enough for what we
855 have+the replacement+the rest of the string (starting
856 at the new input position), so we won't have to check space
857 when there are no errors in the rest of the string) */
858 repptr = PyUnicode_AS_UNICODE(repunicode);
859 repsize = PyUnicode_GET_SIZE(repunicode);
860 requiredsize = *outpos + repsize + insize-newpos;
861 if (requiredsize > outsize) {
862 if (requiredsize<2*outsize)
863 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000864 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000865 goto onError;
866 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
867 }
868 *endinpos = newpos;
869 *inptr = input + newpos;
870 Py_UNICODE_COPY(*outptr, repptr, repsize);
871 *outptr += repsize;
872 *outpos += repsize;
873 /* we made it! */
874 res = 0;
875
876 onError:
877 Py_XDECREF(restuple);
878 return res;
879}
880
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000881/* --- UTF-7 Codec -------------------------------------------------------- */
882
883/* see RFC2152 for details */
884
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

/* True if c must not appear directly in UTF-7 text: anything outside
   1..127, table class 1, and optionally whitespace (class 2) and
   Set O (class 3). */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Map the low six bits of n to the corresponding base64 character. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* True if c is a base64 character.  isalnum() is only defined for
   values representable as unsigned char (and EOF), so it is only
   consulted for the range 1..127. */
#define B64CHAR(c) \
    (((c) > 0 && (c) < 128 && isalnum(c)) || (c) == '+' || (c) == '/')
/* Map a base64 character back to its six-bit value. */
#define UB64(c)        \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 characters to out while at least six bits are pending in
   ch; bits is reduced by six for every character written. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* Emit 16-bit units to out while at least sixteen bits are pending in
   ch.  Relies on `errmsg` and the label `utf7Error` being available in
   the expanding function. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high */       \
            /* surrogate so let's not bother seeing if the low */       \
            /* surrogate is correct or not */                           \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't */      \
            /* represent it in a 16-bit character */                    \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000946
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000947PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000948 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000949 const char *errors)
950{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000951 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000952 Py_ssize_t startinpos;
953 Py_ssize_t endinpos;
954 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000955 const char *e;
956 PyUnicodeObject *unicode;
957 Py_UNICODE *p;
958 const char *errmsg = "";
959 int inShift = 0;
960 unsigned int bitsleft = 0;
961 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000962 int surrogate = 0;
963 PyObject *errorHandler = NULL;
964 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000965
966 unicode = _PyUnicode_New(size);
967 if (!unicode)
968 return NULL;
969 if (size == 0)
970 return (PyObject *)unicode;
971
972 p = unicode->str;
973 e = s + size;
974
975 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000976 Py_UNICODE ch;
977 restart:
978 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000979
980 if (inShift) {
981 if ((ch == '-') || !B64CHAR(ch)) {
982 inShift = 0;
983 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000984
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000985 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
986 if (bitsleft >= 6) {
987 /* The shift sequence has a partial character in it. If
988 bitsleft < 6 then we could just classify it as padding
989 but that is not the case here */
990
991 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000992 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000993 }
994 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000995 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000996 here so indicate the potential of a misencoded character. */
997
998 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
999 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1000 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001001 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001002 }
1003
1004 if (ch == '-') {
1005 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001006 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001007 inShift = 1;
1008 }
1009 } else if (SPECIAL(ch,0,0)) {
1010 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001011 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001012 } else {
1013 *p++ = ch;
1014 }
1015 } else {
1016 charsleft = (charsleft << 6) | UB64(ch);
1017 bitsleft += 6;
1018 s++;
1019 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1020 }
1021 }
1022 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001023 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001024 s++;
1025 if (s < e && *s == '-') {
1026 s++;
1027 *p++ = '+';
1028 } else
1029 {
1030 inShift = 1;
1031 bitsleft = 0;
1032 }
1033 }
1034 else if (SPECIAL(ch,0,0)) {
1035 errmsg = "unexpected special character";
1036 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001037 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001038 }
1039 else {
1040 *p++ = ch;
1041 s++;
1042 }
1043 continue;
1044 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001045 outpos = p-PyUnicode_AS_UNICODE(unicode);
1046 endinpos = s-starts;
1047 if (unicode_decode_call_errorhandler(
1048 errors, &errorHandler,
1049 "utf7", errmsg,
1050 starts, size, &startinpos, &endinpos, &exc, &s,
1051 (PyObject **)&unicode, &outpos, &p))
1052 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001053 }
1054
1055 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001056 outpos = p-PyUnicode_AS_UNICODE(unicode);
1057 endinpos = size;
1058 if (unicode_decode_call_errorhandler(
1059 errors, &errorHandler,
1060 "utf7", "unterminated shift sequence",
1061 starts, size, &startinpos, &endinpos, &exc, &s,
1062 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001063 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001064 if (s < e)
1065 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001066 }
1067
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001068 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001069 goto onError;
1070
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001071 Py_XDECREF(errorHandler);
1072 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001073 return (PyObject *)unicode;
1074
1075onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001076 Py_XDECREF(errorHandler);
1077 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001078 Py_DECREF(unicode);
1079 return NULL;
1080}
1081
1082
1083PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001084 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001085 int encodeSetO,
1086 int encodeWhiteSpace,
1087 const char *errors)
1088{
1089 PyObject *v;
1090 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001091 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001092 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001093 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001094 unsigned int bitsleft = 0;
1095 unsigned long charsleft = 0;
1096 char * out;
1097 char * start;
1098
1099 if (size == 0)
1100 return PyString_FromStringAndSize(NULL, 0);
1101
1102 v = PyString_FromStringAndSize(NULL, cbAllocated);
1103 if (v == NULL)
1104 return NULL;
1105
1106 start = out = PyString_AS_STRING(v);
1107 for (;i < size; ++i) {
1108 Py_UNICODE ch = s[i];
1109
1110 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001111 if (ch == '+') {
1112 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001113 *out++ = '-';
1114 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1115 charsleft = ch;
1116 bitsleft = 16;
1117 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001118 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001119 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001120 } else {
1121 *out++ = (char) ch;
1122 }
1123 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001124 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1125 *out++ = B64(charsleft << (6-bitsleft));
1126 charsleft = 0;
1127 bitsleft = 0;
1128 /* Characters not in the BASE64 set implicitly unshift the sequence
1129 so no '-' is required, except if the character is itself a '-' */
1130 if (B64CHAR(ch) || ch == '-') {
1131 *out++ = '-';
1132 }
1133 inShift = 0;
1134 *out++ = (char) ch;
1135 } else {
1136 bitsleft += 16;
1137 charsleft = (charsleft << 16) | ch;
1138 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1139
1140 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001141 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001142 or '-' then the shift sequence will be terminated implicitly and we
1143 don't have to insert a '-'. */
1144
1145 if (bitsleft == 0) {
1146 if (i + 1 < size) {
1147 Py_UNICODE ch2 = s[i+1];
1148
1149 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001150
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001151 } else if (B64CHAR(ch2) || ch2 == '-') {
1152 *out++ = '-';
1153 inShift = 0;
1154 } else {
1155 inShift = 0;
1156 }
1157
1158 }
1159 else {
1160 *out++ = '-';
1161 inShift = 0;
1162 }
1163 }
Tim Petersced69f82003-09-16 20:30:58 +00001164 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001165 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001166 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001167 if (bitsleft) {
1168 *out++= B64(charsleft << (6-bitsleft) );
1169 *out++ = '-';
1170 }
1171
Tim Peters5de98422002-04-27 18:44:32 +00001172 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001173 return v;
1174}
1175
1176#undef SPECIAL
1177#undef B64
1178#undef B64CHAR
1179#undef UB64
1180#undef ENCODE
1181#undef DECODE
1182
Guido van Rossumd57fd912000-03-10 22:53:23 +00001183/* --- UTF-8 Codec -------------------------------------------------------- */
1184
static
char utf8_code_length[256] = {
    /* Sequence length selected by the first byte of a UTF-8 sequence;
       0 marks a byte that is not a legal prefix (see RFC 2279). */

    /* 0x00 - 0x7F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1206
Guido van Rossumd57fd912000-03-10 22:53:23 +00001207PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001208 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209 const char *errors)
1210{
Walter Dörwald69652032004-09-07 20:24:22 +00001211 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1212}
1213
1214PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001215 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001216 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001217 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001218{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001219 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001220 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001221 Py_ssize_t startinpos;
1222 Py_ssize_t endinpos;
1223 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224 const char *e;
1225 PyUnicodeObject *unicode;
1226 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001227 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001228 PyObject *errorHandler = NULL;
1229 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230
1231 /* Note: size will always be longer than the resulting Unicode
1232 character count */
1233 unicode = _PyUnicode_New(size);
1234 if (!unicode)
1235 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001236 if (size == 0) {
1237 if (consumed)
1238 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001239 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001240 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001241
1242 /* Unpack UTF-8 encoded data */
1243 p = unicode->str;
1244 e = s + size;
1245
1246 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001247 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248
1249 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001250 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 s++;
1252 continue;
1253 }
1254
1255 n = utf8_code_length[ch];
1256
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001257 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001258 if (consumed)
1259 break;
1260 else {
1261 errmsg = "unexpected end of data";
1262 startinpos = s-starts;
1263 endinpos = size;
1264 goto utf8Error;
1265 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001266 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001267
1268 switch (n) {
1269
1270 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001271 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001272 startinpos = s-starts;
1273 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001274 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275
1276 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001277 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001278 startinpos = s-starts;
1279 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001280 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001281
1282 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001283 if ((s[1] & 0xc0) != 0x80) {
1284 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001285 startinpos = s-starts;
1286 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001287 goto utf8Error;
1288 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001290 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001291 startinpos = s-starts;
1292 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001293 errmsg = "illegal encoding";
1294 goto utf8Error;
1295 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001296 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001297 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001298 break;
1299
1300 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001301 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001302 (s[2] & 0xc0) != 0x80) {
1303 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001304 startinpos = s-starts;
1305 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001306 goto utf8Error;
1307 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001308 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001309 if (ch < 0x0800) {
1310 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001311 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001312
1313 XXX For wide builds (UCS-4) we should probably try
1314 to recombine the surrogates into a single code
1315 unit.
1316 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001317 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001318 startinpos = s-starts;
1319 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001320 goto utf8Error;
1321 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001322 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001323 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001324 break;
1325
1326 case 4:
1327 if ((s[1] & 0xc0) != 0x80 ||
1328 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001329 (s[3] & 0xc0) != 0x80) {
1330 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001331 startinpos = s-starts;
1332 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001333 goto utf8Error;
1334 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001335 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1336 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1337 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001338 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001339 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001340 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001341 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001342 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001343 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001344 startinpos = s-starts;
1345 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001346 goto utf8Error;
1347 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001348#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001349 *p++ = (Py_UNICODE)ch;
1350#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001351 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001352
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001353 /* translate from 10000..10FFFF to 0..FFFF */
1354 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001355
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001356 /* high surrogate = top 10 bits added to D800 */
1357 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001358
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001359 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001360 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001361#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001362 break;
1363
1364 default:
1365 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001366 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001367 startinpos = s-starts;
1368 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001369 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001370 }
1371 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001372 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001373
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001374 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001375 outpos = p-PyUnicode_AS_UNICODE(unicode);
1376 if (unicode_decode_call_errorhandler(
1377 errors, &errorHandler,
1378 "utf8", errmsg,
1379 starts, size, &startinpos, &endinpos, &exc, &s,
1380 (PyObject **)&unicode, &outpos, &p))
1381 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382 }
Walter Dörwald69652032004-09-07 20:24:22 +00001383 if (consumed)
1384 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001385
1386 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001387 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001388 goto onError;
1389
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001390 Py_XDECREF(errorHandler);
1391 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 return (PyObject *)unicode;
1393
1394onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001395 Py_XDECREF(errorHandler);
1396 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001397 Py_DECREF(unicode);
1398 return NULL;
1399}
1400
Tim Peters602f7402002-04-27 18:03:26 +00001401/* Allocation strategy: if the string is short, convert into a stack buffer
1402 and allocate exactly as much space needed at the end. Else allocate the
1403 maximum possible needed (4 result bytes per Unicode character), and return
1404 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001405*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001406PyObject *
1407PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001408 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001409 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001410{
Tim Peters602f7402002-04-27 18:03:26 +00001411#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001412
Martin v. Löwis18e16552006-02-15 17:27:45 +00001413 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001414 PyObject *v; /* result string object */
1415 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001416 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001417 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001418 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001419
Tim Peters602f7402002-04-27 18:03:26 +00001420 assert(s != NULL);
1421 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001422
Tim Peters602f7402002-04-27 18:03:26 +00001423 if (size <= MAX_SHORT_UNICHARS) {
1424 /* Write into the stack buffer; nallocated can't overflow.
1425 * At the end, we'll allocate exactly as much heap space as it
1426 * turns out we need.
1427 */
1428 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1429 v = NULL; /* will allocate after we're done */
1430 p = stackbuf;
1431 }
1432 else {
1433 /* Overallocate on the heap, and give the excess back at the end. */
1434 nallocated = size * 4;
1435 if (nallocated / 4 != size) /* overflow! */
1436 return PyErr_NoMemory();
1437 v = PyString_FromStringAndSize(NULL, nallocated);
1438 if (v == NULL)
1439 return NULL;
1440 p = PyString_AS_STRING(v);
1441 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001442
Tim Peters602f7402002-04-27 18:03:26 +00001443 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001444 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001445
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001446 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001447 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001448 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001449
Guido van Rossumd57fd912000-03-10 22:53:23 +00001450 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001451 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001452 *p++ = (char)(0xc0 | (ch >> 6));
1453 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001454 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001455 else {
Tim Peters602f7402002-04-27 18:03:26 +00001456 /* Encode UCS2 Unicode ordinals */
1457 if (ch < 0x10000) {
1458 /* Special case: check for high surrogate */
1459 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1460 Py_UCS4 ch2 = s[i];
1461 /* Check for low surrogate and combine the two to
1462 form a UCS4 value */
1463 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001464 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001465 i++;
1466 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001467 }
Tim Peters602f7402002-04-27 18:03:26 +00001468 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001469 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001470 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001471 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1472 *p++ = (char)(0x80 | (ch & 0x3f));
1473 continue;
1474 }
1475encodeUCS4:
1476 /* Encode UCS4 Unicode ordinals */
1477 *p++ = (char)(0xf0 | (ch >> 18));
1478 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1479 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1480 *p++ = (char)(0x80 | (ch & 0x3f));
1481 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001482 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001483
Tim Peters602f7402002-04-27 18:03:26 +00001484 if (v == NULL) {
1485 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001486 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001487 assert(nneeded <= nallocated);
1488 v = PyString_FromStringAndSize(stackbuf, nneeded);
1489 }
1490 else {
1491 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001492 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001493 assert(nneeded <= nallocated);
1494 _PyString_Resize(&v, nneeded);
1495 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001497
Tim Peters602f7402002-04-27 18:03:26 +00001498#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001499}
1500
Guido van Rossumd57fd912000-03-10 22:53:23 +00001501PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1502{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001503 if (!PyUnicode_Check(unicode)) {
1504 PyErr_BadArgument();
1505 return NULL;
1506 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001507 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1508 PyUnicode_GET_SIZE(unicode),
1509 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001510}
1511
1512/* --- UTF-16 Codec ------------------------------------------------------- */
1513
Tim Peters772747b2001-08-09 22:21:55 +00001514PyObject *
1515PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001516 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001517 const char *errors,
1518 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001519{
Walter Dörwald69652032004-09-07 20:24:22 +00001520 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1521}
1522
1523PyObject *
1524PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001525 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001526 const char *errors,
1527 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001528 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001529{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001530 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001531 Py_ssize_t startinpos;
1532 Py_ssize_t endinpos;
1533 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001534 PyUnicodeObject *unicode;
1535 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001536 const unsigned char *q, *e;
1537 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001538 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001539 /* Offsets from q for retrieving byte pairs in the right order. */
1540#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1541 int ihi = 1, ilo = 0;
1542#else
1543 int ihi = 0, ilo = 1;
1544#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001545 PyObject *errorHandler = NULL;
1546 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001547
1548 /* Note: size will always be longer than the resulting Unicode
1549 character count */
1550 unicode = _PyUnicode_New(size);
1551 if (!unicode)
1552 return NULL;
1553 if (size == 0)
1554 return (PyObject *)unicode;
1555
1556 /* Unpack UTF-16 encoded data */
1557 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001558 q = (unsigned char *)s;
1559 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001560
1561 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001562 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001563
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001564 /* Check for BOM marks (U+FEFF) in the input and adjust current
1565 byte order setting accordingly. In native mode, the leading BOM
1566 mark is skipped, in all other modes, it is copied to the output
1567 stream as-is (giving a ZWNBSP character). */
1568 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001569 if (size >= 2) {
1570 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001571#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001572 if (bom == 0xFEFF) {
1573 q += 2;
1574 bo = -1;
1575 }
1576 else if (bom == 0xFFFE) {
1577 q += 2;
1578 bo = 1;
1579 }
Tim Petersced69f82003-09-16 20:30:58 +00001580#else
Walter Dörwald69652032004-09-07 20:24:22 +00001581 if (bom == 0xFEFF) {
1582 q += 2;
1583 bo = 1;
1584 }
1585 else if (bom == 0xFFFE) {
1586 q += 2;
1587 bo = -1;
1588 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001589#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001590 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001591 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001592
Tim Peters772747b2001-08-09 22:21:55 +00001593 if (bo == -1) {
1594 /* force LE */
1595 ihi = 1;
1596 ilo = 0;
1597 }
1598 else if (bo == 1) {
1599 /* force BE */
1600 ihi = 0;
1601 ilo = 1;
1602 }
1603
1604 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001605 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001606 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001607 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001608 if (consumed)
1609 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001610 errmsg = "truncated data";
1611 startinpos = ((const char *)q)-starts;
1612 endinpos = ((const char *)e)-starts;
1613 goto utf16Error;
1614 /* The remaining input chars are ignored if the callback
1615 chooses to skip the input */
1616 }
1617 ch = (q[ihi] << 8) | q[ilo];
1618
Tim Peters772747b2001-08-09 22:21:55 +00001619 q += 2;
1620
Guido van Rossumd57fd912000-03-10 22:53:23 +00001621 if (ch < 0xD800 || ch > 0xDFFF) {
1622 *p++ = ch;
1623 continue;
1624 }
1625
1626 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001627 if (q >= e) {
1628 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001629 startinpos = (((const char *)q)-2)-starts;
1630 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001631 goto utf16Error;
1632 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001633 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001634 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1635 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001636 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001637#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001638 *p++ = ch;
1639 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001640#else
1641 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001642#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001643 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001644 }
1645 else {
1646 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001647 startinpos = (((const char *)q)-4)-starts;
1648 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001649 goto utf16Error;
1650 }
1651
Guido van Rossumd57fd912000-03-10 22:53:23 +00001652 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001653 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001654 startinpos = (((const char *)q)-2)-starts;
1655 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001656 /* Fall through to report the error */
1657
1658 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001659 outpos = p-PyUnicode_AS_UNICODE(unicode);
1660 if (unicode_decode_call_errorhandler(
1661 errors, &errorHandler,
1662 "utf16", errmsg,
1663 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1664 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001665 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001666 }
1667
1668 if (byteorder)
1669 *byteorder = bo;
1670
Walter Dörwald69652032004-09-07 20:24:22 +00001671 if (consumed)
1672 *consumed = (const char *)q-starts;
1673
Guido van Rossumd57fd912000-03-10 22:53:23 +00001674 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001675 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001676 goto onError;
1677
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001678 Py_XDECREF(errorHandler);
1679 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001680 return (PyObject *)unicode;
1681
1682onError:
1683 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001684 Py_XDECREF(errorHandler);
1685 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001686 return NULL;
1687}
1688
Tim Peters772747b2001-08-09 22:21:55 +00001689PyObject *
1690PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001691 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001692 const char *errors,
1693 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001694{
1695 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001696 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001697#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001698 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001699#else
1700 const int pairs = 0;
1701#endif
Tim Peters772747b2001-08-09 22:21:55 +00001702 /* Offsets from p for storing byte pairs in the right order. */
1703#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1704 int ihi = 1, ilo = 0;
1705#else
1706 int ihi = 0, ilo = 1;
1707#endif
1708
1709#define STORECHAR(CH) \
1710 do { \
1711 p[ihi] = ((CH) >> 8) & 0xff; \
1712 p[ilo] = (CH) & 0xff; \
1713 p += 2; \
1714 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001715
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001716#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001717 for (i = pairs = 0; i < size; i++)
1718 if (s[i] >= 0x10000)
1719 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001720#endif
Tim Petersced69f82003-09-16 20:30:58 +00001721 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001722 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001723 if (v == NULL)
1724 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001725
Tim Peters772747b2001-08-09 22:21:55 +00001726 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001727 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001728 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001729 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001730 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001731
1732 if (byteorder == -1) {
1733 /* force LE */
1734 ihi = 1;
1735 ilo = 0;
1736 }
1737 else if (byteorder == 1) {
1738 /* force BE */
1739 ihi = 0;
1740 ilo = 1;
1741 }
1742
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001743 while (size-- > 0) {
1744 Py_UNICODE ch = *s++;
1745 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001746#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001747 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001748 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1749 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001750 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001751#endif
Tim Peters772747b2001-08-09 22:21:55 +00001752 STORECHAR(ch);
1753 if (ch2)
1754 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001755 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001756 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001757#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001758}
1759
1760PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1761{
1762 if (!PyUnicode_Check(unicode)) {
1763 PyErr_BadArgument();
1764 return NULL;
1765 }
1766 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1767 PyUnicode_GET_SIZE(unicode),
1768 NULL,
1769 0);
1770}
1771
1772/* --- Unicode Escape Codec ----------------------------------------------- */
1773
Fredrik Lundh06d12682001-01-24 07:59:11 +00001774static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001775
Guido van Rossumd57fd912000-03-10 22:53:23 +00001776PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001777 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001778 const char *errors)
1779{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001780 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001781 Py_ssize_t startinpos;
1782 Py_ssize_t endinpos;
1783 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001784 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001785 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001786 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001787 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001788 char* message;
1789 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001790 PyObject *errorHandler = NULL;
1791 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001792
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 /* Escaped strings will always be longer than the resulting
1794 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001795 length after conversion to the true value.
1796 (but if the error callback returns a long replacement string
1797 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798 v = _PyUnicode_New(size);
1799 if (v == NULL)
1800 goto onError;
1801 if (size == 0)
1802 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001803
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001804 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001805 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001806
Guido van Rossumd57fd912000-03-10 22:53:23 +00001807 while (s < end) {
1808 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001809 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001810 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001811
1812 /* Non-escape characters are interpreted as Unicode ordinals */
1813 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001814 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001815 continue;
1816 }
1817
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001818 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001819 /* \ - Escapes */
1820 s++;
1821 switch (*s++) {
1822
1823 /* \x escapes */
1824 case '\n': break;
1825 case '\\': *p++ = '\\'; break;
1826 case '\'': *p++ = '\''; break;
1827 case '\"': *p++ = '\"'; break;
1828 case 'b': *p++ = '\b'; break;
1829 case 'f': *p++ = '\014'; break; /* FF */
1830 case 't': *p++ = '\t'; break;
1831 case 'n': *p++ = '\n'; break;
1832 case 'r': *p++ = '\r'; break;
1833 case 'v': *p++ = '\013'; break; /* VT */
1834 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1835
1836 /* \OOO (octal) escapes */
1837 case '0': case '1': case '2': case '3':
1838 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001839 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001840 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001841 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001842 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001843 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001844 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001845 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 break;
1847
Fredrik Lundhccc74732001-02-18 22:13:49 +00001848 /* hex escapes */
1849 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001850 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001851 digits = 2;
1852 message = "truncated \\xXX escape";
1853 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854
Fredrik Lundhccc74732001-02-18 22:13:49 +00001855 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001857 digits = 4;
1858 message = "truncated \\uXXXX escape";
1859 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001860
Fredrik Lundhccc74732001-02-18 22:13:49 +00001861 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001862 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001863 digits = 8;
1864 message = "truncated \\UXXXXXXXX escape";
1865 hexescape:
1866 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001867 outpos = p-PyUnicode_AS_UNICODE(v);
1868 if (s+digits>end) {
1869 endinpos = size;
1870 if (unicode_decode_call_errorhandler(
1871 errors, &errorHandler,
1872 "unicodeescape", "end of string in escape sequence",
1873 starts, size, &startinpos, &endinpos, &exc, &s,
1874 (PyObject **)&v, &outpos, &p))
1875 goto onError;
1876 goto nextByte;
1877 }
1878 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001879 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001880 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001881 endinpos = (s+i+1)-starts;
1882 if (unicode_decode_call_errorhandler(
1883 errors, &errorHandler,
1884 "unicodeescape", message,
1885 starts, size, &startinpos, &endinpos, &exc, &s,
1886 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001887 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001888 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001889 }
1890 chr = (chr<<4) & ~0xF;
1891 if (c >= '0' && c <= '9')
1892 chr += c - '0';
1893 else if (c >= 'a' && c <= 'f')
1894 chr += 10 + c - 'a';
1895 else
1896 chr += 10 + c - 'A';
1897 }
1898 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001899 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001900 /* _decoding_error will have already written into the
1901 target buffer. */
1902 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001903 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001904 /* when we get here, chr is a 32-bit unicode character */
1905 if (chr <= 0xffff)
1906 /* UCS-2 character */
1907 *p++ = (Py_UNICODE) chr;
1908 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001909 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001910 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001911#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001912 *p++ = chr;
1913#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001914 chr -= 0x10000L;
1915 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001916 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001917#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001918 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001919 endinpos = s-starts;
1920 outpos = p-PyUnicode_AS_UNICODE(v);
1921 if (unicode_decode_call_errorhandler(
1922 errors, &errorHandler,
1923 "unicodeescape", "illegal Unicode character",
1924 starts, size, &startinpos, &endinpos, &exc, &s,
1925 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001926 goto onError;
1927 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001928 break;
1929
1930 /* \N{name} */
1931 case 'N':
1932 message = "malformed \\N character escape";
1933 if (ucnhash_CAPI == NULL) {
1934 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001935 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001936 m = PyImport_ImportModule("unicodedata");
1937 if (m == NULL)
1938 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001939 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001940 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001941 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001942 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00001943 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001944 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001945 if (ucnhash_CAPI == NULL)
1946 goto ucnhashError;
1947 }
1948 if (*s == '{') {
1949 const char *start = s+1;
1950 /* look for the closing brace */
1951 while (*s != '}' && s < end)
1952 s++;
1953 if (s > start && s < end && *s == '}') {
1954 /* found a name. look it up in the unicode database */
1955 message = "unknown Unicode character name";
1956 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001957 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001958 goto store;
1959 }
1960 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001961 endinpos = s-starts;
1962 outpos = p-PyUnicode_AS_UNICODE(v);
1963 if (unicode_decode_call_errorhandler(
1964 errors, &errorHandler,
1965 "unicodeescape", message,
1966 starts, size, &startinpos, &endinpos, &exc, &s,
1967 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001968 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001969 break;
1970
1971 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001972 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001973 message = "\\ at end of string";
1974 s--;
1975 endinpos = s-starts;
1976 outpos = p-PyUnicode_AS_UNICODE(v);
1977 if (unicode_decode_call_errorhandler(
1978 errors, &errorHandler,
1979 "unicodeescape", message,
1980 starts, size, &startinpos, &endinpos, &exc, &s,
1981 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001982 goto onError;
1983 }
1984 else {
1985 *p++ = '\\';
1986 *p++ = (unsigned char)s[-1];
1987 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001988 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001990 nextByte:
1991 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00001993 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001994 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001995 Py_XDECREF(errorHandler);
1996 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001998
Fredrik Lundhccc74732001-02-18 22:13:49 +00001999ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002000 PyErr_SetString(
2001 PyExc_UnicodeError,
2002 "\\N escapes not supported (can't load unicodedata module)"
2003 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002004 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002005 Py_XDECREF(errorHandler);
2006 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002007 return NULL;
2008
Fredrik Lundhccc74732001-02-18 22:13:49 +00002009onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002010 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002011 Py_XDECREF(errorHandler);
2012 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002013 return NULL;
2014}
2015
2016/* Return a Unicode-Escape string version of the Unicode object.
2017
2018 If quotes is true, the string is enclosed in u"" or u'' quotes as
2019 appropriate.
2020
2021*/
2022
Barry Warsaw51ac5802000-03-20 16:36:48 +00002023static const Py_UNICODE *findchar(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002024 Py_ssize_t size,
Barry Warsaw51ac5802000-03-20 16:36:48 +00002025 Py_UNICODE ch);
2026
Guido van Rossumd57fd912000-03-10 22:53:23 +00002027static
2028PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002029 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002030 int quotes)
2031{
2032 PyObject *repr;
2033 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002034
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002035 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036
2037 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
2038 if (repr == NULL)
2039 return NULL;
2040
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002041 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042
2043 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002045 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002046 !findchar(s, size, '"')) ? '"' : '\'';
2047 }
2048 while (size-- > 0) {
2049 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002050
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002051 /* Escape quotes and backslashes */
2052 if ((quotes &&
2053 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 *p++ = '\\';
2055 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002056 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002057 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002058
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002059#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002060 /* Map 21-bit characters to '\U00xxxxxx' */
2061 else if (ch >= 0x10000) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00002062 Py_ssize_t offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00002063
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002064 /* Resize the string if necessary */
2065 if (offset + 12 > PyString_GET_SIZE(repr)) {
2066 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00002067 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002068 p = PyString_AS_STRING(repr) + offset;
2069 }
2070
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002071 *p++ = '\\';
2072 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002073 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2074 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2075 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2076 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2077 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2078 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2079 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002080 *p++ = hexdigit[ch & 0x0000000F];
2081 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002082 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002083#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002084 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2085 else if (ch >= 0xD800 && ch < 0xDC00) {
2086 Py_UNICODE ch2;
2087 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002088
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002089 ch2 = *s++;
2090 size--;
2091 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2092 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2093 *p++ = '\\';
2094 *p++ = 'U';
2095 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2096 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2097 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2098 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2099 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2100 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2101 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2102 *p++ = hexdigit[ucs & 0x0000000F];
2103 continue;
2104 }
2105 /* Fall through: isolated surrogates are copied as-is */
2106 s--;
2107 size++;
2108 }
2109
Guido van Rossumd57fd912000-03-10 22:53:23 +00002110 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002111 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002112 *p++ = '\\';
2113 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002114 *p++ = hexdigit[(ch >> 12) & 0x000F];
2115 *p++ = hexdigit[(ch >> 8) & 0x000F];
2116 *p++ = hexdigit[(ch >> 4) & 0x000F];
2117 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002118 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002119
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002120 /* Map special whitespace to '\t', \n', '\r' */
2121 else if (ch == '\t') {
2122 *p++ = '\\';
2123 *p++ = 't';
2124 }
2125 else if (ch == '\n') {
2126 *p++ = '\\';
2127 *p++ = 'n';
2128 }
2129 else if (ch == '\r') {
2130 *p++ = '\\';
2131 *p++ = 'r';
2132 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002133
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002134 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002135 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002136 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002137 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002138 *p++ = hexdigit[(ch >> 4) & 0x000F];
2139 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002140 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002141
Guido van Rossumd57fd912000-03-10 22:53:23 +00002142 /* Copy everything else as-is */
2143 else
2144 *p++ = (char) ch;
2145 }
2146 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002147 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148
2149 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002150 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002151 return repr;
2152}
2153
2154PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002155 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002156{
2157 return unicodeescape_string(s, size, 0);
2158}
2159
2160PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2161{
2162 if (!PyUnicode_Check(unicode)) {
2163 PyErr_BadArgument();
2164 return NULL;
2165 }
2166 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2167 PyUnicode_GET_SIZE(unicode));
2168}
2169
2170/* --- Raw Unicode Escape Codec ------------------------------------------- */
2171
2172PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002173 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174 const char *errors)
2175{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002176 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002177 Py_ssize_t startinpos;
2178 Py_ssize_t endinpos;
2179 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002180 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002181 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182 const char *end;
2183 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002184 PyObject *errorHandler = NULL;
2185 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002186
Guido van Rossumd57fd912000-03-10 22:53:23 +00002187 /* Escaped strings will always be longer than the resulting
2188 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002189 length after conversion to the true value. (But decoding error
2190 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002191 v = _PyUnicode_New(size);
2192 if (v == NULL)
2193 goto onError;
2194 if (size == 0)
2195 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002196 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002197 end = s + size;
2198 while (s < end) {
2199 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002200 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002201 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002202 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002203
2204 /* Non-escape characters are interpreted as Unicode ordinals */
2205 if (*s != '\\') {
2206 *p++ = (unsigned char)*s++;
2207 continue;
2208 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002209 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002210
2211 /* \u-escapes are only interpreted iff the number of leading
2212 backslashes if odd */
2213 bs = s;
2214 for (;s < end;) {
2215 if (*s != '\\')
2216 break;
2217 *p++ = (unsigned char)*s++;
2218 }
2219 if (((s - bs) & 1) == 0 ||
2220 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002221 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222 continue;
2223 }
2224 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002225 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002226 s++;
2227
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002228 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002229 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002230 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002231 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002233 endinpos = s-starts;
2234 if (unicode_decode_call_errorhandler(
2235 errors, &errorHandler,
2236 "rawunicodeescape", "truncated \\uXXXX",
2237 starts, size, &startinpos, &endinpos, &exc, &s,
2238 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002239 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002240 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002241 }
2242 x = (x<<4) & ~0xF;
2243 if (c >= '0' && c <= '9')
2244 x += c - '0';
2245 else if (c >= 'a' && c <= 'f')
2246 x += 10 + c - 'a';
2247 else
2248 x += 10 + c - 'A';
2249 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002250#ifndef Py_UNICODE_WIDE
2251 if (x > 0x10000) {
2252 if (unicode_decode_call_errorhandler(
2253 errors, &errorHandler,
2254 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2255 starts, size, &startinpos, &endinpos, &exc, &s,
2256 (PyObject **)&v, &outpos, &p))
2257 goto onError;
2258 }
2259#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002260 *p++ = x;
2261 nextByte:
2262 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002263 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002264 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002265 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002266 Py_XDECREF(errorHandler);
2267 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002268 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002269
Guido van Rossumd57fd912000-03-10 22:53:23 +00002270 onError:
2271 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002272 Py_XDECREF(errorHandler);
2273 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002274 return NULL;
2275}
2276
2277PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002278 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279{
2280 PyObject *repr;
2281 char *p;
2282 char *q;
2283
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002284 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002285
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002286#ifdef Py_UNICODE_WIDE
2287 repr = PyString_FromStringAndSize(NULL, 10 * size);
2288#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002289 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002290#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291 if (repr == NULL)
2292 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002293 if (size == 0)
2294 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002295
2296 p = q = PyString_AS_STRING(repr);
2297 while (size-- > 0) {
2298 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002299#ifdef Py_UNICODE_WIDE
2300 /* Map 32-bit characters to '\Uxxxxxxxx' */
2301 if (ch >= 0x10000) {
2302 *p++ = '\\';
2303 *p++ = 'U';
2304 *p++ = hexdigit[(ch >> 28) & 0xf];
2305 *p++ = hexdigit[(ch >> 24) & 0xf];
2306 *p++ = hexdigit[(ch >> 20) & 0xf];
2307 *p++ = hexdigit[(ch >> 16) & 0xf];
2308 *p++ = hexdigit[(ch >> 12) & 0xf];
2309 *p++ = hexdigit[(ch >> 8) & 0xf];
2310 *p++ = hexdigit[(ch >> 4) & 0xf];
2311 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002312 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002313 else
2314#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002315 /* Map 16-bit characters to '\uxxxx' */
2316 if (ch >= 256) {
2317 *p++ = '\\';
2318 *p++ = 'u';
2319 *p++ = hexdigit[(ch >> 12) & 0xf];
2320 *p++ = hexdigit[(ch >> 8) & 0xf];
2321 *p++ = hexdigit[(ch >> 4) & 0xf];
2322 *p++ = hexdigit[ch & 15];
2323 }
2324 /* Copy everything else as-is */
2325 else
2326 *p++ = (char) ch;
2327 }
2328 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002329 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002330 return repr;
2331}
2332
2333PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2334{
2335 if (!PyUnicode_Check(unicode)) {
2336 PyErr_BadArgument();
2337 return NULL;
2338 }
2339 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2340 PyUnicode_GET_SIZE(unicode));
2341}
2342
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002343/* --- Unicode Internal Codec ------------------------------------------- */
2344
2345PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002346 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002347 const char *errors)
2348{
2349 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002350 Py_ssize_t startinpos;
2351 Py_ssize_t endinpos;
2352 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002353 PyUnicodeObject *v;
2354 Py_UNICODE *p;
2355 const char *end;
2356 const char *reason;
2357 PyObject *errorHandler = NULL;
2358 PyObject *exc = NULL;
2359
Neal Norwitzd43069c2006-01-08 01:12:10 +00002360#ifdef Py_UNICODE_WIDE
2361 Py_UNICODE unimax = PyUnicode_GetMax();
2362#endif
2363
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002364 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2365 if (v == NULL)
2366 goto onError;
2367 if (PyUnicode_GetSize((PyObject *)v) == 0)
2368 return (PyObject *)v;
2369 p = PyUnicode_AS_UNICODE(v);
2370 end = s + size;
2371
2372 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00002373 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002374 /* We have to sanity check the raw data, otherwise doom looms for
2375 some malformed UCS-4 data. */
2376 if (
2377 #ifdef Py_UNICODE_WIDE
2378 *p > unimax || *p < 0 ||
2379 #endif
2380 end-s < Py_UNICODE_SIZE
2381 )
2382 {
2383 startinpos = s - starts;
2384 if (end-s < Py_UNICODE_SIZE) {
2385 endinpos = end-starts;
2386 reason = "truncated input";
2387 }
2388 else {
2389 endinpos = s - starts + Py_UNICODE_SIZE;
2390 reason = "illegal code point (> 0x10FFFF)";
2391 }
2392 outpos = p - PyUnicode_AS_UNICODE(v);
2393 if (unicode_decode_call_errorhandler(
2394 errors, &errorHandler,
2395 "unicode_internal", reason,
2396 starts, size, &startinpos, &endinpos, &exc, &s,
2397 (PyObject **)&v, &outpos, &p)) {
2398 goto onError;
2399 }
2400 }
2401 else {
2402 p++;
2403 s += Py_UNICODE_SIZE;
2404 }
2405 }
2406
Martin v. Löwis412fb672006-04-13 06:34:32 +00002407 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002408 goto onError;
2409 Py_XDECREF(errorHandler);
2410 Py_XDECREF(exc);
2411 return (PyObject *)v;
2412
2413 onError:
2414 Py_XDECREF(v);
2415 Py_XDECREF(errorHandler);
2416 Py_XDECREF(exc);
2417 return NULL;
2418}
2419
Guido van Rossumd57fd912000-03-10 22:53:23 +00002420/* --- Latin-1 Codec ------------------------------------------------------ */
2421
2422PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002423 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002424 const char *errors)
2425{
2426 PyUnicodeObject *v;
2427 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002428
Guido van Rossumd57fd912000-03-10 22:53:23 +00002429 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002430 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002431 Py_UNICODE r = *(unsigned char*)s;
2432 return PyUnicode_FromUnicode(&r, 1);
2433 }
2434
Guido van Rossumd57fd912000-03-10 22:53:23 +00002435 v = _PyUnicode_New(size);
2436 if (v == NULL)
2437 goto onError;
2438 if (size == 0)
2439 return (PyObject *)v;
2440 p = PyUnicode_AS_UNICODE(v);
2441 while (size-- > 0)
2442 *p++ = (unsigned char)*s++;
2443 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002444
Guido van Rossumd57fd912000-03-10 22:53:23 +00002445 onError:
2446 Py_XDECREF(v);
2447 return NULL;
2448}
2449
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002450/* create or adjust a UnicodeEncodeError */
2451static void make_encode_exception(PyObject **exceptionObject,
2452 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002453 const Py_UNICODE *unicode, Py_ssize_t size,
2454 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002455 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002456{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002457 if (*exceptionObject == NULL) {
2458 *exceptionObject = PyUnicodeEncodeError_Create(
2459 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002460 }
2461 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002462 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2463 goto onError;
2464 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2465 goto onError;
2466 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2467 goto onError;
2468 return;
2469 onError:
2470 Py_DECREF(*exceptionObject);
2471 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002472 }
2473}
2474
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002475/* raises a UnicodeEncodeError */
2476static void raise_encode_exception(PyObject **exceptionObject,
2477 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002478 const Py_UNICODE *unicode, Py_ssize_t size,
2479 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002480 const char *reason)
2481{
2482 make_encode_exception(exceptionObject,
2483 encoding, unicode, size, startpos, endpos, reason);
2484 if (*exceptionObject != NULL)
2485 PyCodec_StrictErrors(*exceptionObject);
2486}
2487
2488/* error handling callback helper:
2489 build arguments, call the callback and check the arguments,
2490 put the result into newpos and return the replacement string, which
2491 has to be freed by the caller */
2492static PyObject *unicode_encode_call_errorhandler(const char *errors,
2493 PyObject **errorHandler,
2494 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002495 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2496 Py_ssize_t startpos, Py_ssize_t endpos,
2497 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002498{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002499 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002500
2501 PyObject *restuple;
2502 PyObject *resunicode;
2503
2504 if (*errorHandler == NULL) {
2505 *errorHandler = PyCodec_LookupError(errors);
2506 if (*errorHandler == NULL)
2507 return NULL;
2508 }
2509
2510 make_encode_exception(exceptionObject,
2511 encoding, unicode, size, startpos, endpos, reason);
2512 if (*exceptionObject == NULL)
2513 return NULL;
2514
2515 restuple = PyObject_CallFunctionObjArgs(
2516 *errorHandler, *exceptionObject, NULL);
2517 if (restuple == NULL)
2518 return NULL;
2519 if (!PyTuple_Check(restuple)) {
2520 PyErr_Format(PyExc_TypeError, &argparse[4]);
2521 Py_DECREF(restuple);
2522 return NULL;
2523 }
2524 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2525 &resunicode, newpos)) {
2526 Py_DECREF(restuple);
2527 return NULL;
2528 }
2529 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002530 *newpos = size+*newpos;
2531 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002532 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002533 Py_DECREF(restuple);
2534 return NULL;
2535 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002536 Py_INCREF(resunicode);
2537 Py_DECREF(restuple);
2538 return resunicode;
2539}
2540
2541static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002542 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002543 const char *errors,
2544 int limit)
2545{
2546 /* output object */
2547 PyObject *res;
2548 /* pointers to the beginning and end+1 of input */
2549 const Py_UNICODE *startp = p;
2550 const Py_UNICODE *endp = p + size;
2551 /* pointer to the beginning of the unencodable characters */
2552 /* const Py_UNICODE *badp = NULL; */
2553 /* pointer into the output */
2554 char *str;
2555 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002556 Py_ssize_t respos = 0;
2557 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00002558 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2559 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002560 PyObject *errorHandler = NULL;
2561 PyObject *exc = NULL;
2562 /* the following variable is used for caching string comparisons
2563 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2564 int known_errorHandler = -1;
2565
2566 /* allocate enough for a simple encoding without
2567 replacements, if we need more, we'll resize */
2568 res = PyString_FromStringAndSize(NULL, size);
2569 if (res == NULL)
2570 goto onError;
2571 if (size == 0)
2572 return res;
2573 str = PyString_AS_STRING(res);
2574 ressize = size;
2575
2576 while (p<endp) {
2577 Py_UNICODE c = *p;
2578
2579 /* can we encode this? */
2580 if (c<limit) {
2581 /* no overflow check, because we know that the space is enough */
2582 *str++ = (char)c;
2583 ++p;
2584 }
2585 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002586 Py_ssize_t unicodepos = p-startp;
2587 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002588 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002589 Py_ssize_t repsize;
2590 Py_ssize_t newpos;
2591 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002592 Py_UNICODE *uni2;
2593 /* startpos for collecting unencodable chars */
2594 const Py_UNICODE *collstart = p;
2595 const Py_UNICODE *collend = p;
2596 /* find all unecodable characters */
2597 while ((collend < endp) && ((*collend)>=limit))
2598 ++collend;
2599 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2600 if (known_errorHandler==-1) {
2601 if ((errors==NULL) || (!strcmp(errors, "strict")))
2602 known_errorHandler = 1;
2603 else if (!strcmp(errors, "replace"))
2604 known_errorHandler = 2;
2605 else if (!strcmp(errors, "ignore"))
2606 known_errorHandler = 3;
2607 else if (!strcmp(errors, "xmlcharrefreplace"))
2608 known_errorHandler = 4;
2609 else
2610 known_errorHandler = 0;
2611 }
2612 switch (known_errorHandler) {
2613 case 1: /* strict */
2614 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2615 goto onError;
2616 case 2: /* replace */
2617 while (collstart++<collend)
2618 *str++ = '?'; /* fall through */
2619 case 3: /* ignore */
2620 p = collend;
2621 break;
2622 case 4: /* xmlcharrefreplace */
2623 respos = str-PyString_AS_STRING(res);
2624 /* determine replacement size (temporarily (mis)uses p) */
2625 for (p = collstart, repsize = 0; p < collend; ++p) {
2626 if (*p<10)
2627 repsize += 2+1+1;
2628 else if (*p<100)
2629 repsize += 2+2+1;
2630 else if (*p<1000)
2631 repsize += 2+3+1;
2632 else if (*p<10000)
2633 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002634#ifndef Py_UNICODE_WIDE
2635 else
2636 repsize += 2+5+1;
2637#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002638 else if (*p<100000)
2639 repsize += 2+5+1;
2640 else if (*p<1000000)
2641 repsize += 2+6+1;
2642 else
2643 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002644#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002645 }
2646 requiredsize = respos+repsize+(endp-collend);
2647 if (requiredsize > ressize) {
2648 if (requiredsize<2*ressize)
2649 requiredsize = 2*ressize;
2650 if (_PyString_Resize(&res, requiredsize))
2651 goto onError;
2652 str = PyString_AS_STRING(res) + respos;
2653 ressize = requiredsize;
2654 }
2655 /* generate replacement (temporarily (mis)uses p) */
2656 for (p = collstart; p < collend; ++p) {
2657 str += sprintf(str, "&#%d;", (int)*p);
2658 }
2659 p = collend;
2660 break;
2661 default:
2662 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2663 encoding, reason, startp, size, &exc,
2664 collstart-startp, collend-startp, &newpos);
2665 if (repunicode == NULL)
2666 goto onError;
2667 /* need more space? (at least enough for what we
2668 have+the replacement+the rest of the string, so
2669 we won't have to check space for encodable characters) */
2670 respos = str-PyString_AS_STRING(res);
2671 repsize = PyUnicode_GET_SIZE(repunicode);
2672 requiredsize = respos+repsize+(endp-collend);
2673 if (requiredsize > ressize) {
2674 if (requiredsize<2*ressize)
2675 requiredsize = 2*ressize;
2676 if (_PyString_Resize(&res, requiredsize)) {
2677 Py_DECREF(repunicode);
2678 goto onError;
2679 }
2680 str = PyString_AS_STRING(res) + respos;
2681 ressize = requiredsize;
2682 }
2683 /* check if there is anything unencodable in the replacement
2684 and copy it to the output */
2685 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2686 c = *uni2;
2687 if (c >= limit) {
2688 raise_encode_exception(&exc, encoding, startp, size,
2689 unicodepos, unicodepos+1, reason);
2690 Py_DECREF(repunicode);
2691 goto onError;
2692 }
2693 *str = (char)c;
2694 }
2695 p = startp + newpos;
2696 Py_DECREF(repunicode);
2697 }
2698 }
2699 }
2700 /* Resize if we allocated to much */
2701 respos = str-PyString_AS_STRING(res);
2702 if (respos<ressize)
2703 /* If this falls res will be NULL */
2704 _PyString_Resize(&res, respos);
2705 Py_XDECREF(errorHandler);
2706 Py_XDECREF(exc);
2707 return res;
2708
2709 onError:
2710 Py_XDECREF(res);
2711 Py_XDECREF(errorHandler);
2712 Py_XDECREF(exc);
2713 return NULL;
2714}
2715
Guido van Rossumd57fd912000-03-10 22:53:23 +00002716PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002717 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002718 const char *errors)
2719{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002720 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002721}
2722
2723PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2724{
2725 if (!PyUnicode_Check(unicode)) {
2726 PyErr_BadArgument();
2727 return NULL;
2728 }
2729 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2730 PyUnicode_GET_SIZE(unicode),
2731 NULL);
2732}
2733
2734/* --- 7-bit ASCII Codec -------------------------------------------------- */
2735
Guido van Rossumd57fd912000-03-10 22:53:23 +00002736PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002737 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738 const char *errors)
2739{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002740 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741 PyUnicodeObject *v;
2742 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002743 Py_ssize_t startinpos;
2744 Py_ssize_t endinpos;
2745 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002746 const char *e;
2747 PyObject *errorHandler = NULL;
2748 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002749
Guido van Rossumd57fd912000-03-10 22:53:23 +00002750 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002751 if (size == 1 && *(unsigned char*)s < 128) {
2752 Py_UNICODE r = *(unsigned char*)s;
2753 return PyUnicode_FromUnicode(&r, 1);
2754 }
Tim Petersced69f82003-09-16 20:30:58 +00002755
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756 v = _PyUnicode_New(size);
2757 if (v == NULL)
2758 goto onError;
2759 if (size == 0)
2760 return (PyObject *)v;
2761 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002762 e = s + size;
2763 while (s < e) {
2764 register unsigned char c = (unsigned char)*s;
2765 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002766 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002767 ++s;
2768 }
2769 else {
2770 startinpos = s-starts;
2771 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002772 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002773 if (unicode_decode_call_errorhandler(
2774 errors, &errorHandler,
2775 "ascii", "ordinal not in range(128)",
2776 starts, size, &startinpos, &endinpos, &exc, &s,
2777 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002778 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002779 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002781 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00002782 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002783 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002784 Py_XDECREF(errorHandler);
2785 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002787
Guido van Rossumd57fd912000-03-10 22:53:23 +00002788 onError:
2789 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002790 Py_XDECREF(errorHandler);
2791 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792 return NULL;
2793}
2794
Guido van Rossumd57fd912000-03-10 22:53:23 +00002795PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002796 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002797 const char *errors)
2798{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002799 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002800}
2801
2802PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2803{
2804 if (!PyUnicode_Check(unicode)) {
2805 PyErr_BadArgument();
2806 return NULL;
2807 }
2808 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2809 PyUnicode_GET_SIZE(unicode),
2810 NULL);
2811}
2812
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002813#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002814
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002815/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002816
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002817PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002818 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002819 const char *errors)
2820{
2821 PyUnicodeObject *v;
2822 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002823 DWORD usize;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002824
2825 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002826 assert(size < INT_MAX);
2827 usize = MultiByteToWideChar(CP_ACP, 0, s, (int)size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002828 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002829 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2830
2831 v = _PyUnicode_New(usize);
2832 if (v == NULL)
2833 return NULL;
2834 if (usize == 0)
2835 return (PyObject *)v;
2836 p = PyUnicode_AS_UNICODE(v);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002837 if (0 == MultiByteToWideChar(CP_ACP, 0, s, (int)size, p, usize)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002838 Py_DECREF(v);
2839 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2840 }
2841
2842 return (PyObject *)v;
2843}
2844
2845PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002846 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002847 const char *errors)
2848{
2849 PyObject *repr;
2850 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002851 DWORD mbcssize;
2852
2853 /* If there are no characters, bail now! */
2854 if (size==0)
2855 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002856
2857 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002858 assert(size<INT_MAX);
2859 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, (int)size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002860 if (mbcssize==0)
2861 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2862
2863 repr = PyString_FromStringAndSize(NULL, mbcssize);
2864 if (repr == NULL)
2865 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002866 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002867 return repr;
2868
2869 /* Do the conversion */
2870 s = PyString_AS_STRING(repr);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002871 assert(size < INT_MAX);
2872 if (0 == WideCharToMultiByte(CP_ACP, 0, p, (int)size, s, mbcssize, NULL, NULL)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002873 Py_DECREF(repr);
2874 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2875 }
2876 return repr;
2877}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002878
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002879PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2880{
2881 if (!PyUnicode_Check(unicode)) {
2882 PyErr_BadArgument();
2883 return NULL;
2884 }
2885 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2886 PyUnicode_GET_SIZE(unicode),
2887 NULL);
2888}
2889
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002890#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002891
Guido van Rossumd57fd912000-03-10 22:53:23 +00002892/* --- Character Mapping Codec -------------------------------------------- */
2893
Guido van Rossumd57fd912000-03-10 22:53:23 +00002894PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002895 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002896 PyObject *mapping,
2897 const char *errors)
2898{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002899 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002900 Py_ssize_t startinpos;
2901 Py_ssize_t endinpos;
2902 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002903 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002904 PyUnicodeObject *v;
2905 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002906 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002907 PyObject *errorHandler = NULL;
2908 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002909 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002910 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00002911
Guido van Rossumd57fd912000-03-10 22:53:23 +00002912 /* Default to Latin-1 */
2913 if (mapping == NULL)
2914 return PyUnicode_DecodeLatin1(s, size, errors);
2915
2916 v = _PyUnicode_New(size);
2917 if (v == NULL)
2918 goto onError;
2919 if (size == 0)
2920 return (PyObject *)v;
2921 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002922 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002923 if (PyUnicode_CheckExact(mapping)) {
2924 mapstring = PyUnicode_AS_UNICODE(mapping);
2925 maplen = PyUnicode_GET_SIZE(mapping);
2926 while (s < e) {
2927 unsigned char ch = *s;
2928 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002929
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002930 if (ch < maplen)
2931 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002932
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002933 if (x == 0xfffe) {
2934 /* undefined mapping */
2935 outpos = p-PyUnicode_AS_UNICODE(v);
2936 startinpos = s-starts;
2937 endinpos = startinpos+1;
2938 if (unicode_decode_call_errorhandler(
2939 errors, &errorHandler,
2940 "charmap", "character maps to <undefined>",
2941 starts, size, &startinpos, &endinpos, &exc, &s,
2942 (PyObject **)&v, &outpos, &p)) {
2943 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002944 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002945 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002946 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002947 *p++ = x;
2948 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002949 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002950 }
2951 else {
2952 while (s < e) {
2953 unsigned char ch = *s;
2954 PyObject *w, *x;
2955
2956 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2957 w = PyInt_FromLong((long)ch);
2958 if (w == NULL)
2959 goto onError;
2960 x = PyObject_GetItem(mapping, w);
2961 Py_DECREF(w);
2962 if (x == NULL) {
2963 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2964 /* No mapping found means: mapping is undefined. */
2965 PyErr_Clear();
2966 x = Py_None;
2967 Py_INCREF(x);
2968 } else
2969 goto onError;
2970 }
2971
2972 /* Apply mapping */
2973 if (PyInt_Check(x)) {
2974 long value = PyInt_AS_LONG(x);
2975 if (value < 0 || value > 65535) {
2976 PyErr_SetString(PyExc_TypeError,
2977 "character mapping must be in range(65536)");
2978 Py_DECREF(x);
2979 goto onError;
2980 }
2981 *p++ = (Py_UNICODE)value;
2982 }
2983 else if (x == Py_None) {
2984 /* undefined mapping */
2985 outpos = p-PyUnicode_AS_UNICODE(v);
2986 startinpos = s-starts;
2987 endinpos = startinpos+1;
2988 if (unicode_decode_call_errorhandler(
2989 errors, &errorHandler,
2990 "charmap", "character maps to <undefined>",
2991 starts, size, &startinpos, &endinpos, &exc, &s,
2992 (PyObject **)&v, &outpos, &p)) {
2993 Py_DECREF(x);
2994 goto onError;
2995 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00002996 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002997 continue;
2998 }
2999 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003000 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003001
3002 if (targetsize == 1)
3003 /* 1-1 mapping */
3004 *p++ = *PyUnicode_AS_UNICODE(x);
3005
3006 else if (targetsize > 1) {
3007 /* 1-n mapping */
3008 if (targetsize > extrachars) {
3009 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003010 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3011 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003012 (targetsize << 2);
3013 extrachars += needed;
3014 if (_PyUnicode_Resize(&v,
3015 PyUnicode_GET_SIZE(v) + needed) < 0) {
3016 Py_DECREF(x);
3017 goto onError;
3018 }
3019 p = PyUnicode_AS_UNICODE(v) + oldpos;
3020 }
3021 Py_UNICODE_COPY(p,
3022 PyUnicode_AS_UNICODE(x),
3023 targetsize);
3024 p += targetsize;
3025 extrachars -= targetsize;
3026 }
3027 /* 1-0 mapping: skip the character */
3028 }
3029 else {
3030 /* wrong return value */
3031 PyErr_SetString(PyExc_TypeError,
3032 "character mapping must return integer, None or unicode");
3033 Py_DECREF(x);
3034 goto onError;
3035 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003037 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039 }
3040 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003041 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003042 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003043 Py_XDECREF(errorHandler);
3044 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003045 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003046
Guido van Rossumd57fd912000-03-10 22:53:23 +00003047 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003048 Py_XDECREF(errorHandler);
3049 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050 Py_XDECREF(v);
3051 return NULL;
3052}
3053
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003054/* Lookup the character ch in the mapping. If the character
3055 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003056 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003057static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003058{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003059 PyObject *w = PyInt_FromLong((long)c);
3060 PyObject *x;
3061
3062 if (w == NULL)
3063 return NULL;
3064 x = PyObject_GetItem(mapping, w);
3065 Py_DECREF(w);
3066 if (x == NULL) {
3067 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3068 /* No mapping found means: mapping is undefined. */
3069 PyErr_Clear();
3070 x = Py_None;
3071 Py_INCREF(x);
3072 return x;
3073 } else
3074 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003075 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003076 else if (x == Py_None)
3077 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003078 else if (PyInt_Check(x)) {
3079 long value = PyInt_AS_LONG(x);
3080 if (value < 0 || value > 255) {
3081 PyErr_SetString(PyExc_TypeError,
3082 "character mapping must be in range(256)");
3083 Py_DECREF(x);
3084 return NULL;
3085 }
3086 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003088 else if (PyString_Check(x))
3089 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003091 /* wrong return value */
3092 PyErr_SetString(PyExc_TypeError,
3093 "character mapping must return integer, None or str");
3094 Py_DECREF(x);
3095 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096 }
3097}
3098
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003099/* lookup the character, put the result in the output string and adjust
3100 various state variables. Reallocate the output string if not enough
3101 space is available. Return a new reference to the object that
3102 was put in the output buffer, or Py_None, if the mapping was undefined
3103 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003104 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003105static
3106PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003107 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003108{
3109 PyObject *rep = charmapencode_lookup(c, mapping);
3110
3111 if (rep==NULL)
3112 return NULL;
3113 else if (rep==Py_None)
3114 return rep;
3115 else {
3116 char *outstart = PyString_AS_STRING(*outobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003117 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003118 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003119 Py_ssize_t requiredsize = *outpos+1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003120 if (outsize<requiredsize) {
3121 /* exponentially overallocate to minimize reallocations */
3122 if (requiredsize < 2*outsize)
3123 requiredsize = 2*outsize;
3124 if (_PyString_Resize(outobj, requiredsize)) {
3125 Py_DECREF(rep);
3126 return NULL;
3127 }
3128 outstart = PyString_AS_STRING(*outobj);
3129 }
3130 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3131 }
3132 else {
3133 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003134 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3135 Py_ssize_t requiredsize = *outpos+repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003136 if (outsize<requiredsize) {
3137 /* exponentially overallocate to minimize reallocations */
3138 if (requiredsize < 2*outsize)
3139 requiredsize = 2*outsize;
3140 if (_PyString_Resize(outobj, requiredsize)) {
3141 Py_DECREF(rep);
3142 return NULL;
3143 }
3144 outstart = PyString_AS_STRING(*outobj);
3145 }
3146 memcpy(outstart + *outpos, repchars, repsize);
3147 *outpos += repsize;
3148 }
3149 }
3150 return rep;
3151}
3152
3153/* handle an error in PyUnicode_EncodeCharmap
3154 Return 0 on success, -1 on error */
3155static
3156int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003157 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003158 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003159 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003160 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003161{
3162 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003163 Py_ssize_t repsize;
3164 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003165 Py_UNICODE *uni2;
3166 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003167 Py_ssize_t collstartpos = *inpos;
3168 Py_ssize_t collendpos = *inpos+1;
3169 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003170 char *encoding = "charmap";
3171 char *reason = "character maps to <undefined>";
3172
3173 PyObject *x;
3174 /* find all unencodable characters */
3175 while (collendpos < size) {
3176 x = charmapencode_lookup(p[collendpos], mapping);
3177 if (x==NULL)
3178 return -1;
3179 else if (x!=Py_None) {
3180 Py_DECREF(x);
3181 break;
3182 }
3183 Py_DECREF(x);
3184 ++collendpos;
3185 }
3186 /* cache callback name lookup
3187 * (if not done yet, i.e. it's the first error) */
3188 if (*known_errorHandler==-1) {
3189 if ((errors==NULL) || (!strcmp(errors, "strict")))
3190 *known_errorHandler = 1;
3191 else if (!strcmp(errors, "replace"))
3192 *known_errorHandler = 2;
3193 else if (!strcmp(errors, "ignore"))
3194 *known_errorHandler = 3;
3195 else if (!strcmp(errors, "xmlcharrefreplace"))
3196 *known_errorHandler = 4;
3197 else
3198 *known_errorHandler = 0;
3199 }
3200 switch (*known_errorHandler) {
3201 case 1: /* strict */
3202 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3203 return -1;
3204 case 2: /* replace */
3205 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3206 x = charmapencode_output('?', mapping, res, respos);
3207 if (x==NULL) {
3208 return -1;
3209 }
3210 else if (x==Py_None) {
3211 Py_DECREF(x);
3212 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3213 return -1;
3214 }
3215 Py_DECREF(x);
3216 }
3217 /* fall through */
3218 case 3: /* ignore */
3219 *inpos = collendpos;
3220 break;
3221 case 4: /* xmlcharrefreplace */
3222 /* generate replacement (temporarily (mis)uses p) */
3223 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3224 char buffer[2+29+1+1];
3225 char *cp;
3226 sprintf(buffer, "&#%d;", (int)p[collpos]);
3227 for (cp = buffer; *cp; ++cp) {
3228 x = charmapencode_output(*cp, mapping, res, respos);
3229 if (x==NULL)
3230 return -1;
3231 else if (x==Py_None) {
3232 Py_DECREF(x);
3233 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3234 return -1;
3235 }
3236 Py_DECREF(x);
3237 }
3238 }
3239 *inpos = collendpos;
3240 break;
3241 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003242 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003243 encoding, reason, p, size, exceptionObject,
3244 collstartpos, collendpos, &newpos);
3245 if (repunicode == NULL)
3246 return -1;
3247 /* generate replacement */
3248 repsize = PyUnicode_GET_SIZE(repunicode);
3249 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3250 x = charmapencode_output(*uni2, mapping, res, respos);
3251 if (x==NULL) {
3252 Py_DECREF(repunicode);
3253 return -1;
3254 }
3255 else if (x==Py_None) {
3256 Py_DECREF(repunicode);
3257 Py_DECREF(x);
3258 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3259 return -1;
3260 }
3261 Py_DECREF(x);
3262 }
3263 *inpos = newpos;
3264 Py_DECREF(repunicode);
3265 }
3266 return 0;
3267}
3268
Guido van Rossumd57fd912000-03-10 22:53:23 +00003269PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003270 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003271 PyObject *mapping,
3272 const char *errors)
3273{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003274 /* output object */
3275 PyObject *res = NULL;
3276 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003277 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003278 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003279 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003280 PyObject *errorHandler = NULL;
3281 PyObject *exc = NULL;
3282 /* the following variable is used for caching string comparisons
3283 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3284 * 3=ignore, 4=xmlcharrefreplace */
3285 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003286
3287 /* Default to Latin-1 */
3288 if (mapping == NULL)
3289 return PyUnicode_EncodeLatin1(p, size, errors);
3290
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003291 /* allocate enough for a simple encoding without
3292 replacements, if we need more, we'll resize */
3293 res = PyString_FromStringAndSize(NULL, size);
3294 if (res == NULL)
3295 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003296 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003297 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003299 while (inpos<size) {
3300 /* try to encode it */
3301 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3302 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003304 if (x==Py_None) { /* unencodable character */
3305 if (charmap_encoding_error(p, size, &inpos, mapping,
3306 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003307 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003308 &res, &respos)) {
3309 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003310 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003311 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003312 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003313 else
3314 /* done with this character => adjust input position */
3315 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003316 Py_DECREF(x);
3317 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003318
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003319 /* Resize if we allocated to much */
3320 if (respos<PyString_GET_SIZE(res)) {
3321 if (_PyString_Resize(&res, respos))
3322 goto onError;
3323 }
3324 Py_XDECREF(exc);
3325 Py_XDECREF(errorHandler);
3326 return res;
3327
3328 onError:
3329 Py_XDECREF(res);
3330 Py_XDECREF(exc);
3331 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003332 return NULL;
3333}
3334
3335PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3336 PyObject *mapping)
3337{
3338 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3339 PyErr_BadArgument();
3340 return NULL;
3341 }
3342 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3343 PyUnicode_GET_SIZE(unicode),
3344 mapping,
3345 NULL);
3346}
3347
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003348/* create or adjust a UnicodeTranslateError */
3349static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003350 const Py_UNICODE *unicode, Py_ssize_t size,
3351 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003352 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003353{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003354 if (*exceptionObject == NULL) {
3355 *exceptionObject = PyUnicodeTranslateError_Create(
3356 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003357 }
3358 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003359 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3360 goto onError;
3361 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3362 goto onError;
3363 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3364 goto onError;
3365 return;
3366 onError:
3367 Py_DECREF(*exceptionObject);
3368 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003369 }
3370}
3371
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003372/* raises a UnicodeTranslateError */
3373static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003374 const Py_UNICODE *unicode, Py_ssize_t size,
3375 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003376 const char *reason)
3377{
3378 make_translate_exception(exceptionObject,
3379 unicode, size, startpos, endpos, reason);
3380 if (*exceptionObject != NULL)
3381 PyCodec_StrictErrors(*exceptionObject);
3382}
3383
3384/* error handling callback helper:
3385 build arguments, call the callback and check the arguments,
3386 put the result into newpos and return the replacement string, which
3387 has to be freed by the caller */
3388static PyObject *unicode_translate_call_errorhandler(const char *errors,
3389 PyObject **errorHandler,
3390 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003391 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3392 Py_ssize_t startpos, Py_ssize_t endpos,
3393 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003394{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003395 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003396
Martin v. Löwis412fb672006-04-13 06:34:32 +00003397 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003398 PyObject *restuple;
3399 PyObject *resunicode;
3400
3401 if (*errorHandler == NULL) {
3402 *errorHandler = PyCodec_LookupError(errors);
3403 if (*errorHandler == NULL)
3404 return NULL;
3405 }
3406
3407 make_translate_exception(exceptionObject,
3408 unicode, size, startpos, endpos, reason);
3409 if (*exceptionObject == NULL)
3410 return NULL;
3411
3412 restuple = PyObject_CallFunctionObjArgs(
3413 *errorHandler, *exceptionObject, NULL);
3414 if (restuple == NULL)
3415 return NULL;
3416 if (!PyTuple_Check(restuple)) {
3417 PyErr_Format(PyExc_TypeError, &argparse[4]);
3418 Py_DECREF(restuple);
3419 return NULL;
3420 }
3421 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003422 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003423 Py_DECREF(restuple);
3424 return NULL;
3425 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003426 if (i_newpos<0)
3427 *newpos = size+i_newpos;
3428 else
3429 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003430 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003431 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003432 Py_DECREF(restuple);
3433 return NULL;
3434 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003435 Py_INCREF(resunicode);
3436 Py_DECREF(restuple);
3437 return resunicode;
3438}
3439
3440/* Lookup the character ch in the mapping and put the result in result,
3441 which must be decrefed by the caller.
3442 Return 0 on success, -1 on error */
3443static
3444int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3445{
3446 PyObject *w = PyInt_FromLong((long)c);
3447 PyObject *x;
3448
3449 if (w == NULL)
3450 return -1;
3451 x = PyObject_GetItem(mapping, w);
3452 Py_DECREF(w);
3453 if (x == NULL) {
3454 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3455 /* No mapping found means: use 1:1 mapping. */
3456 PyErr_Clear();
3457 *result = NULL;
3458 return 0;
3459 } else
3460 return -1;
3461 }
3462 else if (x == Py_None) {
3463 *result = x;
3464 return 0;
3465 }
3466 else if (PyInt_Check(x)) {
3467 long value = PyInt_AS_LONG(x);
3468 long max = PyUnicode_GetMax();
3469 if (value < 0 || value > max) {
3470 PyErr_Format(PyExc_TypeError,
3471 "character mapping must be in range(0x%lx)", max+1);
3472 Py_DECREF(x);
3473 return -1;
3474 }
3475 *result = x;
3476 return 0;
3477 }
3478 else if (PyUnicode_Check(x)) {
3479 *result = x;
3480 return 0;
3481 }
3482 else {
3483 /* wrong return value */
3484 PyErr_SetString(PyExc_TypeError,
3485 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003486 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003487 return -1;
3488 }
3489}
3490/* ensure that *outobj is at least requiredsize characters long,
3491if not reallocate and adjust various state variables.
3492Return 0 on success, -1 on error */
3493static
Walter Dörwald4894c302003-10-24 14:25:28 +00003494int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003495 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003496{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003497 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003498 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003499 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003500 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003501 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003502 if (requiredsize < 2 * oldsize)
3503 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003504 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003505 return -1;
3506 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003507 }
3508 return 0;
3509}
3510/* lookup the character, put the result in the output string and adjust
3511 various state variables. Return a new reference to the object that
3512 was put in the output buffer in *result, or Py_None, if the mapping was
3513 undefined (in which case no character was written).
3514 The called must decref result.
3515 Return 0 on success, -1 on error. */
3516static
Walter Dörwald4894c302003-10-24 14:25:28 +00003517int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003518 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003519 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003520{
Walter Dörwald4894c302003-10-24 14:25:28 +00003521 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003522 return -1;
3523 if (*res==NULL) {
3524 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003525 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003526 }
3527 else if (*res==Py_None)
3528 ;
3529 else if (PyInt_Check(*res)) {
3530 /* no overflow check, because we know that the space is enough */
3531 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3532 }
3533 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003534 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003535 if (repsize==1) {
3536 /* no overflow check, because we know that the space is enough */
3537 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3538 }
3539 else if (repsize!=0) {
3540 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003541 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003542 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003543 repsize - 1;
3544 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003545 return -1;
3546 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3547 *outp += repsize;
3548 }
3549 }
3550 else
3551 return -1;
3552 return 0;
3553}
3554
3555PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003556 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003557 PyObject *mapping,
3558 const char *errors)
3559{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003560 /* output object */
3561 PyObject *res = NULL;
3562 /* pointers to the beginning and end+1 of input */
3563 const Py_UNICODE *startp = p;
3564 const Py_UNICODE *endp = p + size;
3565 /* pointer into the output */
3566 Py_UNICODE *str;
3567 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003568 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003569 char *reason = "character maps to <undefined>";
3570 PyObject *errorHandler = NULL;
3571 PyObject *exc = NULL;
3572 /* the following variable is used for caching string comparisons
3573 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3574 * 3=ignore, 4=xmlcharrefreplace */
3575 int known_errorHandler = -1;
3576
Guido van Rossumd57fd912000-03-10 22:53:23 +00003577 if (mapping == NULL) {
3578 PyErr_BadArgument();
3579 return NULL;
3580 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003581
3582 /* allocate enough for a simple 1:1 translation without
3583 replacements, if we need more, we'll resize */
3584 res = PyUnicode_FromUnicode(NULL, size);
3585 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003586 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003587 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003588 return res;
3589 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003590
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003591 while (p<endp) {
3592 /* try to encode it */
3593 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003594 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003595 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003596 goto onError;
3597 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003598 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003599 if (x!=Py_None) /* it worked => adjust input pointer */
3600 ++p;
3601 else { /* untranslatable character */
3602 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003603 Py_ssize_t repsize;
3604 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003605 Py_UNICODE *uni2;
3606 /* startpos for collecting untranslatable chars */
3607 const Py_UNICODE *collstart = p;
3608 const Py_UNICODE *collend = p+1;
3609 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003610
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003611 /* find all untranslatable characters */
3612 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003613 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003614 goto onError;
3615 Py_XDECREF(x);
3616 if (x!=Py_None)
3617 break;
3618 ++collend;
3619 }
3620 /* cache callback name lookup
3621 * (if not done yet, i.e. it's the first error) */
3622 if (known_errorHandler==-1) {
3623 if ((errors==NULL) || (!strcmp(errors, "strict")))
3624 known_errorHandler = 1;
3625 else if (!strcmp(errors, "replace"))
3626 known_errorHandler = 2;
3627 else if (!strcmp(errors, "ignore"))
3628 known_errorHandler = 3;
3629 else if (!strcmp(errors, "xmlcharrefreplace"))
3630 known_errorHandler = 4;
3631 else
3632 known_errorHandler = 0;
3633 }
3634 switch (known_errorHandler) {
3635 case 1: /* strict */
3636 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3637 goto onError;
3638 case 2: /* replace */
3639 /* No need to check for space, this is a 1:1 replacement */
3640 for (coll = collstart; coll<collend; ++coll)
3641 *str++ = '?';
3642 /* fall through */
3643 case 3: /* ignore */
3644 p = collend;
3645 break;
3646 case 4: /* xmlcharrefreplace */
3647 /* generate replacement (temporarily (mis)uses p) */
3648 for (p = collstart; p < collend; ++p) {
3649 char buffer[2+29+1+1];
3650 char *cp;
3651 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003652 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003653 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3654 goto onError;
3655 for (cp = buffer; *cp; ++cp)
3656 *str++ = *cp;
3657 }
3658 p = collend;
3659 break;
3660 default:
3661 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3662 reason, startp, size, &exc,
3663 collstart-startp, collend-startp, &newpos);
3664 if (repunicode == NULL)
3665 goto onError;
3666 /* generate replacement */
3667 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003668 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003669 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3670 Py_DECREF(repunicode);
3671 goto onError;
3672 }
3673 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3674 *str++ = *uni2;
3675 p = startp + newpos;
3676 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003677 }
3678 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003679 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003680 /* Resize if we allocated to much */
3681 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003682 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003683 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003684 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003685 }
3686 Py_XDECREF(exc);
3687 Py_XDECREF(errorHandler);
3688 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003689
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003690 onError:
3691 Py_XDECREF(res);
3692 Py_XDECREF(exc);
3693 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003694 return NULL;
3695}
3696
3697PyObject *PyUnicode_Translate(PyObject *str,
3698 PyObject *mapping,
3699 const char *errors)
3700{
3701 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003702
Guido van Rossumd57fd912000-03-10 22:53:23 +00003703 str = PyUnicode_FromObject(str);
3704 if (str == NULL)
3705 goto onError;
3706 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3707 PyUnicode_GET_SIZE(str),
3708 mapping,
3709 errors);
3710 Py_DECREF(str);
3711 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003712
Guido van Rossumd57fd912000-03-10 22:53:23 +00003713 onError:
3714 Py_XDECREF(str);
3715 return NULL;
3716}
Tim Petersced69f82003-09-16 20:30:58 +00003717
Guido van Rossum9e896b32000-04-05 20:11:21 +00003718/* --- Decimal Encoder ---------------------------------------------------- */
3719
3720int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003721 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00003722 char *output,
3723 const char *errors)
3724{
3725 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003726 PyObject *errorHandler = NULL;
3727 PyObject *exc = NULL;
3728 const char *encoding = "decimal";
3729 const char *reason = "invalid decimal Unicode string";
3730 /* the following variable is used for caching string comparisons
3731 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3732 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003733
3734 if (output == NULL) {
3735 PyErr_BadArgument();
3736 return -1;
3737 }
3738
3739 p = s;
3740 end = s + length;
3741 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003742 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003743 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003744 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003745 Py_ssize_t repsize;
3746 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003747 Py_UNICODE *uni2;
3748 Py_UNICODE *collstart;
3749 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003750
Guido van Rossum9e896b32000-04-05 20:11:21 +00003751 if (Py_UNICODE_ISSPACE(ch)) {
3752 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003753 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003754 continue;
3755 }
3756 decimal = Py_UNICODE_TODECIMAL(ch);
3757 if (decimal >= 0) {
3758 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003759 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003760 continue;
3761 }
Guido van Rossumba477042000-04-06 18:18:10 +00003762 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003763 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003764 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003765 continue;
3766 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003767 /* All other characters are considered unencodable */
3768 collstart = p;
3769 collend = p+1;
3770 while (collend < end) {
3771 if ((0 < *collend && *collend < 256) ||
3772 !Py_UNICODE_ISSPACE(*collend) ||
3773 Py_UNICODE_TODECIMAL(*collend))
3774 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003775 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003776 /* cache callback name lookup
3777 * (if not done yet, i.e. it's the first error) */
3778 if (known_errorHandler==-1) {
3779 if ((errors==NULL) || (!strcmp(errors, "strict")))
3780 known_errorHandler = 1;
3781 else if (!strcmp(errors, "replace"))
3782 known_errorHandler = 2;
3783 else if (!strcmp(errors, "ignore"))
3784 known_errorHandler = 3;
3785 else if (!strcmp(errors, "xmlcharrefreplace"))
3786 known_errorHandler = 4;
3787 else
3788 known_errorHandler = 0;
3789 }
3790 switch (known_errorHandler) {
3791 case 1: /* strict */
3792 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3793 goto onError;
3794 case 2: /* replace */
3795 for (p = collstart; p < collend; ++p)
3796 *output++ = '?';
3797 /* fall through */
3798 case 3: /* ignore */
3799 p = collend;
3800 break;
3801 case 4: /* xmlcharrefreplace */
3802 /* generate replacement (temporarily (mis)uses p) */
3803 for (p = collstart; p < collend; ++p)
3804 output += sprintf(output, "&#%d;", (int)*p);
3805 p = collend;
3806 break;
3807 default:
3808 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3809 encoding, reason, s, length, &exc,
3810 collstart-s, collend-s, &newpos);
3811 if (repunicode == NULL)
3812 goto onError;
3813 /* generate replacement */
3814 repsize = PyUnicode_GET_SIZE(repunicode);
3815 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3816 Py_UNICODE ch = *uni2;
3817 if (Py_UNICODE_ISSPACE(ch))
3818 *output++ = ' ';
3819 else {
3820 decimal = Py_UNICODE_TODECIMAL(ch);
3821 if (decimal >= 0)
3822 *output++ = '0' + decimal;
3823 else if (0 < ch && ch < 256)
3824 *output++ = (char)ch;
3825 else {
3826 Py_DECREF(repunicode);
3827 raise_encode_exception(&exc, encoding,
3828 s, length, collstart-s, collend-s, reason);
3829 goto onError;
3830 }
3831 }
3832 }
3833 p = s + newpos;
3834 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003835 }
3836 }
3837 /* 0-terminate the output string */
3838 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003839 Py_XDECREF(exc);
3840 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003841 return 0;
3842
3843 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003844 Py_XDECREF(exc);
3845 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003846 return -1;
3847}
3848
Guido van Rossumd57fd912000-03-10 22:53:23 +00003849/* --- Helpers ------------------------------------------------------------ */
3850
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003851#define USE_FAST /* experimental fast search implementation */
3852
3853/* fast search/count implementation, based on a mix between boyer-
3854 moore and horspool, with a few more bells and whistles on the top.
3855 for some more background, see: http://effbot.org/stringlib */
3856
3857#define FAST_COUNT 0
3858#define FAST_SEARCH 1
3859
3860LOCAL(int) fastsearch(Py_UNICODE* s, Py_ssize_t n,
3861 Py_UNICODE* p, Py_ssize_t m,
3862 int mode)
3863{
3864 long mask;
3865 int skip, count = 0;
3866 Py_ssize_t i, j, mlast, w;
3867
3868 w = n - m;
3869
3870 if (w < 0)
3871 return -1;
3872
3873 /* look for special cases */
3874 if (m <= 1) {
3875 if (m < 0)
3876 return -1;
3877 /* use special case for 1-character strings */
3878 if (mode == FAST_COUNT) {
3879 for (i = 0; i < n; i++)
3880 if (s[i] == p[0])
3881 count++;
3882 return count;
3883 } else {
3884 for (i = 0; i < n; i++)
3885 if (s[i] == p[0])
3886 return i;
3887 }
3888 return -1;
3889 }
3890
3891 mlast = m - 1;
3892
3893 /* create compressed boyer-moore delta 1 table */
3894 skip = mlast - 1;
3895 /* process pattern[:-1] */
3896 for (mask = i = 0; i < mlast; i++) {
3897 mask |= (1 << (p[i] & 0x1F));
3898 if (p[i] == p[mlast])
3899 skip = mlast - i - 1;
3900 }
3901 /* process pattern[-1] outside the loop */
3902 mask |= (1 << (p[mlast] & 0x1F));
3903
3904 for (i = 0; i <= w; i++) {
3905 /* note: using mlast in the skip path slows things down on x86 */
3906 if (s[i+m-1] == p[m-1]) {
3907 /* candidate match */
3908 for (j = 0; j < mlast; j++)
3909 if (s[i+j] != p[j])
3910 break;
3911 if (j == mlast) {
3912 /* got a match! */
3913 if (mode != FAST_COUNT)
3914 return i;
3915 count++;
3916 i = i + mlast;
3917 continue;
3918 }
3919 /* miss: check if next character is part of pattern */
3920 if (!(mask & (1 << (s[i+m] & 0x1F))))
3921 i = i + m;
3922 else {
3923 i = i + skip;
3924 continue;
3925 }
3926 } else {
3927 /* skip: check if next character is part of pattern */
3928 if (!(mask & (1 << (s[i+m] & 0x1F))))
3929 i = i + m;
3930 }
3931 }
3932
3933 if (mode != FAST_COUNT)
3934 return -1;
3935 return count;
3936}
3937
3938LOCAL(Py_ssize_t) count(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003939 Py_ssize_t start,
3940 Py_ssize_t end,
3941 PyUnicodeObject *substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003942{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003943 Py_ssize_t count = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003944
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003945 if (start < 0)
3946 start += self->length;
3947 if (start < 0)
3948 start = 0;
3949 if (end > self->length)
3950 end = self->length;
3951 if (end < 0)
3952 end += self->length;
3953 if (end < 0)
3954 end = 0;
3955
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003956 if (substring->length == 0)
3957 return (end - start + 1);
3958
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003959#ifdef USE_FAST
3960 count = fastsearch(
3961 PyUnicode_AS_UNICODE(self) + start, end - start,
3962 substring->str, substring->length, FAST_COUNT
3963 );
3964 if (count < 0)
3965 count = 0; /* no match */
3966#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003967 end -= substring->length;
3968
3969 while (start <= end)
3970 if (Py_UNICODE_MATCH(self, start, substring)) {
3971 count++;
3972 start += substring->length;
3973 } else
3974 start++;
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003975#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003976
3977 return count;
3978}
3979
Martin v. Löwis18e16552006-02-15 17:27:45 +00003980Py_ssize_t PyUnicode_Count(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003981 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003982 Py_ssize_t start,
3983 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003984{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003985 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00003986
Guido van Rossumd57fd912000-03-10 22:53:23 +00003987 str = PyUnicode_FromObject(str);
3988 if (str == NULL)
3989 return -1;
3990 substr = PyUnicode_FromObject(substr);
3991 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003992 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003993 return -1;
3994 }
Tim Petersced69f82003-09-16 20:30:58 +00003995
Guido van Rossumd57fd912000-03-10 22:53:23 +00003996 result = count((PyUnicodeObject *)str,
3997 start, end,
3998 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003999
Guido van Rossumd57fd912000-03-10 22:53:23 +00004000 Py_DECREF(str);
4001 Py_DECREF(substr);
4002 return result;
4003}
4004
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004005static Py_ssize_t findstring(PyUnicodeObject *self,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004006 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004007 Py_ssize_t start,
4008 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004009 int direction)
4010{
4011 if (start < 0)
4012 start += self->length;
4013 if (start < 0)
4014 start = 0;
4015
Guido van Rossumd57fd912000-03-10 22:53:23 +00004016 if (end > self->length)
4017 end = self->length;
4018 if (end < 0)
4019 end += self->length;
4020 if (end < 0)
4021 end = 0;
4022
Guido van Rossum76afbd92002-08-20 17:29:29 +00004023 if (substring->length == 0)
4024 return (direction > 0) ? start : end;
4025
Fredrik Lundh6471ee42006-05-24 14:28:11 +00004026#ifdef USE_FAST
4027 if (direction > 0) {
4028 Py_ssize_t pos = fastsearch(
4029 PyUnicode_AS_UNICODE(self) + start, end - start,
4030 substring->str, substring->length, FAST_SEARCH
4031 );
4032 if (pos < 0)
4033 return pos;
4034 return pos + start;
4035 }
4036#endif
4037
Guido van Rossumd57fd912000-03-10 22:53:23 +00004038 end -= substring->length;
4039
4040 if (direction < 0) {
4041 for (; end >= start; end--)
4042 if (Py_UNICODE_MATCH(self, end, substring))
4043 return end;
4044 } else {
4045 for (; start <= end; start++)
4046 if (Py_UNICODE_MATCH(self, start, substring))
4047 return start;
4048 }
4049
4050 return -1;
4051}
4052
Martin v. Löwis18e16552006-02-15 17:27:45 +00004053Py_ssize_t PyUnicode_Find(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004054 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004055 Py_ssize_t start,
4056 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004057 int direction)
4058{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004059 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004060
Guido van Rossumd57fd912000-03-10 22:53:23 +00004061 str = PyUnicode_FromObject(str);
4062 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004063 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004064 substr = PyUnicode_FromObject(substr);
4065 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004066 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004067 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004068 }
Tim Petersced69f82003-09-16 20:30:58 +00004069
Guido van Rossumd57fd912000-03-10 22:53:23 +00004070 result = findstring((PyUnicodeObject *)str,
4071 (PyUnicodeObject *)substr,
4072 start, end, direction);
4073 Py_DECREF(str);
4074 Py_DECREF(substr);
4075 return result;
4076}
4077
Tim Petersced69f82003-09-16 20:30:58 +00004078static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004079int tailmatch(PyUnicodeObject *self,
4080 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004081 Py_ssize_t start,
4082 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004083 int direction)
4084{
4085 if (start < 0)
4086 start += self->length;
4087 if (start < 0)
4088 start = 0;
4089
4090 if (substring->length == 0)
4091 return 1;
4092
4093 if (end > self->length)
4094 end = self->length;
4095 if (end < 0)
4096 end += self->length;
4097 if (end < 0)
4098 end = 0;
4099
4100 end -= substring->length;
4101 if (end < start)
4102 return 0;
4103
4104 if (direction > 0) {
4105 if (Py_UNICODE_MATCH(self, end, substring))
4106 return 1;
4107 } else {
4108 if (Py_UNICODE_MATCH(self, start, substring))
4109 return 1;
4110 }
4111
4112 return 0;
4113}
4114
Martin v. Löwis18e16552006-02-15 17:27:45 +00004115Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004116 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004117 Py_ssize_t start,
4118 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004119 int direction)
4120{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004121 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004122
Guido van Rossumd57fd912000-03-10 22:53:23 +00004123 str = PyUnicode_FromObject(str);
4124 if (str == NULL)
4125 return -1;
4126 substr = PyUnicode_FromObject(substr);
4127 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004128 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004129 return -1;
4130 }
Tim Petersced69f82003-09-16 20:30:58 +00004131
Guido van Rossumd57fd912000-03-10 22:53:23 +00004132 result = tailmatch((PyUnicodeObject *)str,
4133 (PyUnicodeObject *)substr,
4134 start, end, direction);
4135 Py_DECREF(str);
4136 Py_DECREF(substr);
4137 return result;
4138}
4139
Tim Petersced69f82003-09-16 20:30:58 +00004140static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004141const Py_UNICODE *findchar(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004142 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004143 Py_UNICODE ch)
4144{
4145 /* like wcschr, but doesn't stop at NULL characters */
4146
4147 while (size-- > 0) {
4148 if (*s == ch)
4149 return s;
4150 s++;
4151 }
4152
4153 return NULL;
4154}
4155
4156/* Apply fixfct filter to the Unicode object self and return a
4157 reference to the modified object */
4158
Tim Petersced69f82003-09-16 20:30:58 +00004159static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004160PyObject *fixup(PyUnicodeObject *self,
4161 int (*fixfct)(PyUnicodeObject *s))
4162{
4163
4164 PyUnicodeObject *u;
4165
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004166 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004167 if (u == NULL)
4168 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004169
4170 Py_UNICODE_COPY(u->str, self->str, self->length);
4171
Tim Peters7a29bd52001-09-12 03:03:31 +00004172 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004173 /* fixfct should return TRUE if it modified the buffer. If
4174 FALSE, return a reference to the original buffer instead
4175 (to save space, not time) */
4176 Py_INCREF(self);
4177 Py_DECREF(u);
4178 return (PyObject*) self;
4179 }
4180 return (PyObject*) u;
4181}
4182
Tim Petersced69f82003-09-16 20:30:58 +00004183static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004184int fixupper(PyUnicodeObject *self)
4185{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004186 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004187 Py_UNICODE *s = self->str;
4188 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004189
Guido van Rossumd57fd912000-03-10 22:53:23 +00004190 while (len-- > 0) {
4191 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004192
Guido van Rossumd57fd912000-03-10 22:53:23 +00004193 ch = Py_UNICODE_TOUPPER(*s);
4194 if (ch != *s) {
4195 status = 1;
4196 *s = ch;
4197 }
4198 s++;
4199 }
4200
4201 return status;
4202}
4203
Tim Petersced69f82003-09-16 20:30:58 +00004204static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004205int fixlower(PyUnicodeObject *self)
4206{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004207 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004208 Py_UNICODE *s = self->str;
4209 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004210
Guido van Rossumd57fd912000-03-10 22:53:23 +00004211 while (len-- > 0) {
4212 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004213
Guido van Rossumd57fd912000-03-10 22:53:23 +00004214 ch = Py_UNICODE_TOLOWER(*s);
4215 if (ch != *s) {
4216 status = 1;
4217 *s = ch;
4218 }
4219 s++;
4220 }
4221
4222 return status;
4223}
4224
Tim Petersced69f82003-09-16 20:30:58 +00004225static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004226int fixswapcase(PyUnicodeObject *self)
4227{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004228 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004229 Py_UNICODE *s = self->str;
4230 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004231
Guido van Rossumd57fd912000-03-10 22:53:23 +00004232 while (len-- > 0) {
4233 if (Py_UNICODE_ISUPPER(*s)) {
4234 *s = Py_UNICODE_TOLOWER(*s);
4235 status = 1;
4236 } else if (Py_UNICODE_ISLOWER(*s)) {
4237 *s = Py_UNICODE_TOUPPER(*s);
4238 status = 1;
4239 }
4240 s++;
4241 }
4242
4243 return status;
4244}
4245
Tim Petersced69f82003-09-16 20:30:58 +00004246static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004247int fixcapitalize(PyUnicodeObject *self)
4248{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004249 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004250 Py_UNICODE *s = self->str;
4251 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004252
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004253 if (len == 0)
4254 return 0;
4255 if (Py_UNICODE_ISLOWER(*s)) {
4256 *s = Py_UNICODE_TOUPPER(*s);
4257 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004258 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004259 s++;
4260 while (--len > 0) {
4261 if (Py_UNICODE_ISUPPER(*s)) {
4262 *s = Py_UNICODE_TOLOWER(*s);
4263 status = 1;
4264 }
4265 s++;
4266 }
4267 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004268}
4269
4270static
4271int fixtitle(PyUnicodeObject *self)
4272{
4273 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4274 register Py_UNICODE *e;
4275 int previous_is_cased;
4276
4277 /* Shortcut for single character strings */
4278 if (PyUnicode_GET_SIZE(self) == 1) {
4279 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4280 if (*p != ch) {
4281 *p = ch;
4282 return 1;
4283 }
4284 else
4285 return 0;
4286 }
Tim Petersced69f82003-09-16 20:30:58 +00004287
Guido van Rossumd57fd912000-03-10 22:53:23 +00004288 e = p + PyUnicode_GET_SIZE(self);
4289 previous_is_cased = 0;
4290 for (; p < e; p++) {
4291 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004292
Guido van Rossumd57fd912000-03-10 22:53:23 +00004293 if (previous_is_cased)
4294 *p = Py_UNICODE_TOLOWER(ch);
4295 else
4296 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004297
4298 if (Py_UNICODE_ISLOWER(ch) ||
4299 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004300 Py_UNICODE_ISTITLE(ch))
4301 previous_is_cased = 1;
4302 else
4303 previous_is_cased = 0;
4304 }
4305 return 1;
4306}
4307
Tim Peters8ce9f162004-08-27 01:49:32 +00004308PyObject *
4309PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004310{
Tim Peters8ce9f162004-08-27 01:49:32 +00004311 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004312 const Py_UNICODE blank = ' ';
4313 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00004314 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004315 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00004316 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4317 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004318 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4319 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004320 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004321 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004322 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004323
Tim Peters05eba1f2004-08-27 21:32:02 +00004324 fseq = PySequence_Fast(seq, "");
4325 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004326 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004327 }
4328
Tim Peters91879ab2004-08-27 22:35:44 +00004329 /* Grrrr. A codec may be invoked to convert str objects to
4330 * Unicode, and so it's possible to call back into Python code
4331 * during PyUnicode_FromObject(), and so it's possible for a sick
4332 * codec to change the size of fseq (if seq is a list). Therefore
4333 * we have to keep refetching the size -- can't assume seqlen
4334 * is invariant.
4335 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004336 seqlen = PySequence_Fast_GET_SIZE(fseq);
4337 /* If empty sequence, return u"". */
4338 if (seqlen == 0) {
4339 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4340 goto Done;
4341 }
4342 /* If singleton sequence with an exact Unicode, return that. */
4343 if (seqlen == 1) {
4344 item = PySequence_Fast_GET_ITEM(fseq, 0);
4345 if (PyUnicode_CheckExact(item)) {
4346 Py_INCREF(item);
4347 res = (PyUnicodeObject *)item;
4348 goto Done;
4349 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004350 }
4351
Tim Peters05eba1f2004-08-27 21:32:02 +00004352 /* At least two items to join, or one that isn't exact Unicode. */
4353 if (seqlen > 1) {
4354 /* Set up sep and seplen -- they're needed. */
4355 if (separator == NULL) {
4356 sep = &blank;
4357 seplen = 1;
4358 }
4359 else {
4360 internal_separator = PyUnicode_FromObject(separator);
4361 if (internal_separator == NULL)
4362 goto onError;
4363 sep = PyUnicode_AS_UNICODE(internal_separator);
4364 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004365 /* In case PyUnicode_FromObject() mutated seq. */
4366 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004367 }
4368 }
4369
4370 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004371 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004372 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004373 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004374 res_p = PyUnicode_AS_UNICODE(res);
4375 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004376
Tim Peters05eba1f2004-08-27 21:32:02 +00004377 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00004378 Py_ssize_t itemlen;
4379 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004380
4381 item = PySequence_Fast_GET_ITEM(fseq, i);
4382 /* Convert item to Unicode. */
4383 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4384 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00004385 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004386 " %.80s found",
4387 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004388 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004389 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004390 item = PyUnicode_FromObject(item);
4391 if (item == NULL)
4392 goto onError;
4393 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004394
Tim Peters91879ab2004-08-27 22:35:44 +00004395 /* In case PyUnicode_FromObject() mutated seq. */
4396 seqlen = PySequence_Fast_GET_SIZE(fseq);
4397
Tim Peters8ce9f162004-08-27 01:49:32 +00004398 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004399 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004400 new_res_used = res_used + itemlen;
Tim Peters286085c2006-05-22 19:17:04 +00004401 if (new_res_used <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004402 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004403 if (i < seqlen - 1) {
4404 new_res_used += seplen;
Tim Peters286085c2006-05-22 19:17:04 +00004405 if (new_res_used <= 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004406 goto Overflow;
4407 }
4408 if (new_res_used > res_alloc) {
4409 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004410 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004411 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00004412 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004413 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004414 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00004415 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004416 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004417 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004418 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004419 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004420 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004421
4422 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004423 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004424 res_p += itemlen;
4425 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004426 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004427 res_p += seplen;
4428 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004429 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004430 res_used = new_res_used;
4431 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004432
Tim Peters05eba1f2004-08-27 21:32:02 +00004433 /* Shrink res to match the used area; this probably can't fail,
4434 * but it's cheap to check.
4435 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004436 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004437 goto onError;
4438
4439 Done:
4440 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004441 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004442 return (PyObject *)res;
4443
Tim Peters8ce9f162004-08-27 01:49:32 +00004444 Overflow:
4445 PyErr_SetString(PyExc_OverflowError,
4446 "join() is too long for a Python string");
4447 Py_DECREF(item);
4448 /* fall through */
4449
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004451 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004452 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004453 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004454 return NULL;
4455}
4456
Tim Petersced69f82003-09-16 20:30:58 +00004457static
4458PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004459 Py_ssize_t left,
4460 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004461 Py_UNICODE fill)
4462{
4463 PyUnicodeObject *u;
4464
4465 if (left < 0)
4466 left = 0;
4467 if (right < 0)
4468 right = 0;
4469
Tim Peters7a29bd52001-09-12 03:03:31 +00004470 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471 Py_INCREF(self);
4472 return self;
4473 }
4474
4475 u = _PyUnicode_New(left + self->length + right);
4476 if (u) {
4477 if (left)
4478 Py_UNICODE_FILL(u->str, fill, left);
4479 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4480 if (right)
4481 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4482 }
4483
4484 return u;
4485}
4486
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The expanding function must provide a `PyObject *str` scratch
   variable, a `list` variable and an `onError` label; control jumps to
   onError if the slice cannot be created or appended.

   The body is wrapped in do { ... } while (0) so that the macro behaves
   as a single statement (safe under an unbraced if/else); every use
   must therefore be terminated with a semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4497
4498static
4499PyObject *split_whitespace(PyUnicodeObject *self,
4500 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004501 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004502{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004503 register Py_ssize_t i;
4504 register Py_ssize_t j;
4505 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004506 PyObject *str;
4507
4508 for (i = j = 0; i < len; ) {
4509 /* find a token */
4510 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4511 i++;
4512 j = i;
4513 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4514 i++;
4515 if (j < i) {
4516 if (maxcount-- <= 0)
4517 break;
4518 SPLIT_APPEND(self->str, j, i);
4519 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4520 i++;
4521 j = i;
4522 }
4523 }
4524 if (j < len) {
4525 SPLIT_APPEND(self->str, j, len);
4526 }
4527 return list;
4528
4529 onError:
4530 Py_DECREF(list);
4531 return NULL;
4532}
4533
4534PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004535 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004536{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004537 register Py_ssize_t i;
4538 register Py_ssize_t j;
4539 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004540 PyObject *list;
4541 PyObject *str;
4542 Py_UNICODE *data;
4543
4544 string = PyUnicode_FromObject(string);
4545 if (string == NULL)
4546 return NULL;
4547 data = PyUnicode_AS_UNICODE(string);
4548 len = PyUnicode_GET_SIZE(string);
4549
Guido van Rossumd57fd912000-03-10 22:53:23 +00004550 list = PyList_New(0);
4551 if (!list)
4552 goto onError;
4553
4554 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004555 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004556
Guido van Rossumd57fd912000-03-10 22:53:23 +00004557 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004558 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004559 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004560
4561 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004562 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004563 if (i < len) {
4564 if (data[i] == '\r' && i + 1 < len &&
4565 data[i+1] == '\n')
4566 i += 2;
4567 else
4568 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004569 if (keepends)
4570 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004571 }
Guido van Rossum86662912000-04-11 15:38:46 +00004572 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004573 j = i;
4574 }
4575 if (j < len) {
4576 SPLIT_APPEND(data, j, len);
4577 }
4578
4579 Py_DECREF(string);
4580 return list;
4581
4582 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004583 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004584 Py_DECREF(string);
4585 return NULL;
4586}
4587
Tim Petersced69f82003-09-16 20:30:58 +00004588static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004589PyObject *split_char(PyUnicodeObject *self,
4590 PyObject *list,
4591 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004592 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004593{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004594 register Py_ssize_t i;
4595 register Py_ssize_t j;
4596 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004597 PyObject *str;
4598
4599 for (i = j = 0; i < len; ) {
4600 if (self->str[i] == ch) {
4601 if (maxcount-- <= 0)
4602 break;
4603 SPLIT_APPEND(self->str, j, i);
4604 i = j = i + 1;
4605 } else
4606 i++;
4607 }
4608 if (j <= len) {
4609 SPLIT_APPEND(self->str, j, len);
4610 }
4611 return list;
4612
4613 onError:
4614 Py_DECREF(list);
4615 return NULL;
4616}
4617
Tim Petersced69f82003-09-16 20:30:58 +00004618static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004619PyObject *split_substring(PyUnicodeObject *self,
4620 PyObject *list,
4621 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004622 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004623{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004624 register Py_ssize_t i;
4625 register Py_ssize_t j;
4626 Py_ssize_t len = self->length;
4627 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004628 PyObject *str;
4629
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004630 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004631 if (Py_UNICODE_MATCH(self, i, substring)) {
4632 if (maxcount-- <= 0)
4633 break;
4634 SPLIT_APPEND(self->str, j, i);
4635 i = j = i + sublen;
4636 } else
4637 i++;
4638 }
4639 if (j <= len) {
4640 SPLIT_APPEND(self->str, j, len);
4641 }
4642 return list;
4643
4644 onError:
4645 Py_DECREF(list);
4646 return NULL;
4647}
4648
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004649static
4650PyObject *rsplit_whitespace(PyUnicodeObject *self,
4651 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004652 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004653{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004654 register Py_ssize_t i;
4655 register Py_ssize_t j;
4656 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004657 PyObject *str;
4658
4659 for (i = j = len - 1; i >= 0; ) {
4660 /* find a token */
4661 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4662 i--;
4663 j = i;
4664 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4665 i--;
4666 if (j > i) {
4667 if (maxcount-- <= 0)
4668 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004669 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004670 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4671 i--;
4672 j = i;
4673 }
4674 }
4675 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004676 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004677 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004678 if (PyList_Reverse(list) < 0)
4679 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004680 return list;
4681
4682 onError:
4683 Py_DECREF(list);
4684 return NULL;
4685}
4686
4687static
4688PyObject *rsplit_char(PyUnicodeObject *self,
4689 PyObject *list,
4690 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004691 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004692{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004693 register Py_ssize_t i;
4694 register Py_ssize_t j;
4695 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004696 PyObject *str;
4697
4698 for (i = j = len - 1; i >= 0; ) {
4699 if (self->str[i] == ch) {
4700 if (maxcount-- <= 0)
4701 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004702 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004703 j = i = i - 1;
4704 } else
4705 i--;
4706 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004707 if (j >= -1) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004708 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004709 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004710 if (PyList_Reverse(list) < 0)
4711 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004712 return list;
4713
4714 onError:
4715 Py_DECREF(list);
4716 return NULL;
4717}
4718
4719static
4720PyObject *rsplit_substring(PyUnicodeObject *self,
4721 PyObject *list,
4722 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004723 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004724{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004725 register Py_ssize_t i;
4726 register Py_ssize_t j;
4727 Py_ssize_t len = self->length;
4728 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004729 PyObject *str;
4730
4731 for (i = len - sublen, j = len; i >= 0; ) {
4732 if (Py_UNICODE_MATCH(self, i, substring)) {
4733 if (maxcount-- <= 0)
4734 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004735 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004736 j = i;
4737 i -= sublen;
4738 } else
4739 i--;
4740 }
4741 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004742 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004743 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004744 if (PyList_Reverse(list) < 0)
4745 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004746 return list;
4747
4748 onError:
4749 Py_DECREF(list);
4750 return NULL;
4751}
4752
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753#undef SPLIT_APPEND
4754
4755static
4756PyObject *split(PyUnicodeObject *self,
4757 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004758 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759{
4760 PyObject *list;
4761
4762 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004763 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004764
4765 list = PyList_New(0);
4766 if (!list)
4767 return NULL;
4768
4769 if (substring == NULL)
4770 return split_whitespace(self,list,maxcount);
4771
4772 else if (substring->length == 1)
4773 return split_char(self,list,substring->str[0],maxcount);
4774
4775 else if (substring->length == 0) {
4776 Py_DECREF(list);
4777 PyErr_SetString(PyExc_ValueError, "empty separator");
4778 return NULL;
4779 }
4780 else
4781 return split_substring(self,list,substring,maxcount);
4782}
4783
Tim Petersced69f82003-09-16 20:30:58 +00004784static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004785PyObject *rsplit(PyUnicodeObject *self,
4786 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004787 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004788{
4789 PyObject *list;
4790
4791 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004792 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004793
4794 list = PyList_New(0);
4795 if (!list)
4796 return NULL;
4797
4798 if (substring == NULL)
4799 return rsplit_whitespace(self,list,maxcount);
4800
4801 else if (substring->length == 1)
4802 return rsplit_char(self,list,substring->str[0],maxcount);
4803
4804 else if (substring->length == 0) {
4805 Py_DECREF(list);
4806 PyErr_SetString(PyExc_ValueError, "empty separator");
4807 return NULL;
4808 }
4809 else
4810 return rsplit_substring(self,list,substring,maxcount);
4811}
4812
4813static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004814PyObject *replace(PyUnicodeObject *self,
4815 PyUnicodeObject *str1,
4816 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004817 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004818{
4819 PyUnicodeObject *u;
4820
4821 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004822 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823
4824 if (str1->length == 1 && str2->length == 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004825 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004826
4827 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004828 if (!findchar(self->str, self->length, str1->str[0]) &&
4829 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004830 /* nothing to replace, return original string */
4831 Py_INCREF(self);
4832 u = self;
4833 } else {
4834 Py_UNICODE u1 = str1->str[0];
4835 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004836
Guido van Rossumd57fd912000-03-10 22:53:23 +00004837 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004838 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004839 self->length
4840 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004841 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004842 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004843 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844 for (i = 0; i < u->length; i++)
4845 if (u->str[i] == u1) {
4846 if (--maxcount < 0)
4847 break;
4848 u->str[i] = u2;
4849 }
4850 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004851 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852
4853 } else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004854 Py_ssize_t n, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855 Py_UNICODE *p;
4856
4857 /* replace strings */
4858 n = count(self, 0, self->length, str1);
4859 if (n > maxcount)
4860 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004861 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004862 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004863 if (PyUnicode_CheckExact(self)) {
4864 Py_INCREF(self);
4865 u = self;
4866 }
4867 else {
4868 u = (PyUnicodeObject *)
4869 PyUnicode_FromUnicode(self->str, self->length);
4870 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871 } else {
4872 u = _PyUnicode_New(
4873 self->length + n * (str2->length - str1->length));
4874 if (u) {
4875 i = 0;
4876 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004877 if (str1->length > 0) {
4878 while (i <= self->length - str1->length)
4879 if (Py_UNICODE_MATCH(self, i, str1)) {
4880 /* replace string segment */
4881 Py_UNICODE_COPY(p, str2->str, str2->length);
4882 p += str2->length;
4883 i += str1->length;
4884 if (--n <= 0) {
4885 /* copy remaining part */
4886 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4887 break;
4888 }
4889 } else
4890 *p++ = self->str[i++];
4891 } else {
4892 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004893 Py_UNICODE_COPY(p, str2->str, str2->length);
4894 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004895 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004896 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004897 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004898 }
4899 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4900 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901 }
4902 }
4903 }
Tim Petersced69f82003-09-16 20:30:58 +00004904
Guido van Rossumd57fd912000-03-10 22:53:23 +00004905 return (PyObject *) u;
4906}
4907
4908/* --- Unicode Object Methods --------------------------------------------- */
4909
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004910PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004911"S.title() -> unicode\n\
4912\n\
4913Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004914characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004915
4916static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004917unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004918{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004919 return fixup(self, fixtitle);
4920}
4921
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004922PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004923"S.capitalize() -> unicode\n\
4924\n\
4925Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004926have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004927
4928static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004929unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004930{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004931 return fixup(self, fixcapitalize);
4932}
4933
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).

   Splits self on whitespace, runs fixup(..., fixcapitalize) on every
   word, stores the results back into the word list and joins the list
   with PyUnicode_Join(NULL, ...).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *fixed = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                                fixcapitalize);
        if (fixed == NULL)
            goto done;          /* result is still NULL */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, fixed);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
4971
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004972/* Argument converter. Coerces to a single unicode character */
4973
4974static int
4975convert_uc(PyObject *obj, void *addr)
4976{
4977 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4978 PyObject *uniobj;
4979 Py_UNICODE *unistr;
4980
4981 uniobj = PyUnicode_FromObject(obj);
4982 if (uniobj == NULL) {
4983 PyErr_SetString(PyExc_TypeError,
4984 "The fill character cannot be converted to Unicode");
4985 return 0;
4986 }
4987 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4988 PyErr_SetString(PyExc_TypeError,
4989 "The fill character must be exactly one character long");
4990 Py_DECREF(uniobj);
4991 return 0;
4992 }
4993 unistr = PyUnicode_AS_UNICODE(uniobj);
4994 *fillcharloc = unistr[0];
4995 Py_DECREF(uniobj);
4996 return 1;
4997}
4998
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004999PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005000"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005001\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005002Return S centered in a Unicode string of length width. Padding is\n\
5003done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005004
5005static PyObject *
5006unicode_center(PyUnicodeObject *self, PyObject *args)
5007{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005008 Py_ssize_t marg, left;
5009 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005010 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005011
Thomas Woutersde017742006-02-16 19:34:37 +00005012 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005013 return NULL;
5014
Tim Peters7a29bd52001-09-12 03:03:31 +00005015 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005016 Py_INCREF(self);
5017 return (PyObject*) self;
5018 }
5019
5020 marg = width - self->length;
5021 left = marg / 2 + (marg & width & 1);
5022
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005023 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005024}
5025
Marc-André Lemburge5034372000-08-08 08:04:29 +00005026#if 0
5027
5028/* This code should go into some future Unicode collation support
5029 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005030 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005031
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005032/* speedy UTF-16 code point order comparison */
5033/* gleaned from: */
5034/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5035
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005036static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005037{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005038 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005039 0, 0, 0, 0, 0, 0, 0, 0,
5040 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005041 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005042};
5043
Guido van Rossumd57fd912000-03-10 22:53:23 +00005044static int
5045unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5046{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005047 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005048
Guido van Rossumd57fd912000-03-10 22:53:23 +00005049 Py_UNICODE *s1 = str1->str;
5050 Py_UNICODE *s2 = str2->str;
5051
5052 len1 = str1->length;
5053 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005054
Guido van Rossumd57fd912000-03-10 22:53:23 +00005055 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005056 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005057
5058 c1 = *s1++;
5059 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005060
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005061 if (c1 > (1<<11) * 26)
5062 c1 += utf16Fixup[c1>>11];
5063 if (c2 > (1<<11) * 26)
5064 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005065 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005066
5067 if (c1 != c2)
5068 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005069
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005070 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005071 }
5072
5073 return (len1 < len2) ? -1 : (len1 != len2);
5074}
5075
Marc-André Lemburge5034372000-08-08 08:04:29 +00005076#else
5077
5078static int
5079unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5080{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005081 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005082
5083 Py_UNICODE *s1 = str1->str;
5084 Py_UNICODE *s2 = str2->str;
5085
5086 len1 = str1->length;
5087 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005088
Marc-André Lemburge5034372000-08-08 08:04:29 +00005089 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005090 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005091
Fredrik Lundh45714e92001-06-26 16:39:36 +00005092 c1 = *s1++;
5093 c2 = *s2++;
5094
5095 if (c1 != c2)
5096 return (c1 < c2) ? -1 : 1;
5097
Marc-André Lemburge5034372000-08-08 08:04:29 +00005098 len1--; len2--;
5099 }
5100
5101 return (len1 < len2) ? -1 : (len1 != len2);
5102}
5103
5104#endif
5105
Guido van Rossumd57fd912000-03-10 22:53:23 +00005106int PyUnicode_Compare(PyObject *left,
5107 PyObject *right)
5108{
5109 PyUnicodeObject *u = NULL, *v = NULL;
5110 int result;
5111
5112 /* Coerce the two arguments */
5113 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5114 if (u == NULL)
5115 goto onError;
5116 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5117 if (v == NULL)
5118 goto onError;
5119
Thomas Wouters7e474022000-07-16 12:04:32 +00005120 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121 if (v == u) {
5122 Py_DECREF(u);
5123 Py_DECREF(v);
5124 return 0;
5125 }
5126
5127 result = unicode_compare(u, v);
5128
5129 Py_DECREF(u);
5130 Py_DECREF(v);
5131 return result;
5132
5133onError:
5134 Py_XDECREF(u);
5135 Py_XDECREF(v);
5136 return -1;
5137}
5138
Guido van Rossum403d68b2000-03-13 15:55:09 +00005139int PyUnicode_Contains(PyObject *container,
5140 PyObject *element)
5141{
Fredrik Lundh833bf942006-05-23 10:12:21 +00005142 PyUnicodeObject *u, *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005143 int result;
5144 Py_ssize_t size;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005145
5146 /* Coerce the two arguments */
Fredrik Lundh833bf942006-05-23 10:12:21 +00005147 v = (PyUnicodeObject *) PyUnicode_FromObject(element);
5148 if (!v) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005149 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005150 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00005151 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005152 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00005153
5154 u = (PyUnicodeObject *) PyUnicode_FromObject(container);
5155 if (!u) {
5156 Py_DECREF(v);
5157 return -1;
5158 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005159
Barry Warsaw817918c2002-08-06 16:58:21 +00005160 size = PyUnicode_GET_SIZE(v);
Fredrik Lundh833bf942006-05-23 10:12:21 +00005161 if (!size) {
5162 result = 1;
5163 goto done;
5164 }
Barry Warsaw817918c2002-08-06 16:58:21 +00005165
Guido van Rossum403d68b2000-03-13 15:55:09 +00005166 result = 0;
Fredrik Lundh833bf942006-05-23 10:12:21 +00005167
Barry Warsaw817918c2002-08-06 16:58:21 +00005168 if (size == 1) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00005169 Py_UNICODE chr = PyUnicode_AS_UNICODE(v)[0];
5170 Py_UNICODE* ptr = PyUnicode_AS_UNICODE(u);
5171 Py_UNICODE* end = ptr + PyUnicode_GET_SIZE(u);
5172 for (; ptr < end; ptr++) {
5173 if (*ptr == chr) {
Barry Warsaw817918c2002-08-06 16:58:21 +00005174 result = 1;
5175 break;
5176 }
5177 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00005178 } else {
Fredrik Lundh240bf2a2006-05-24 10:20:36 +00005179 Py_ssize_t start = 0;
5180 Py_ssize_t end = PyUnicode_GET_SIZE(u) - size;
Fredrik Lundh833bf942006-05-23 10:12:21 +00005181 for (; start <= end; start++)
5182 if (Py_UNICODE_MATCH(u, start, v)) {
5183 result = 1;
5184 break;
5185 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005186 }
5187
Fredrik Lundh833bf942006-05-23 10:12:21 +00005188done:
Guido van Rossum403d68b2000-03-13 15:55:09 +00005189 Py_DECREF(u);
5190 Py_DECREF(v);
5191 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005192}
5193
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194/* Concat to string or Unicode object giving a new Unicode object. */
5195
5196PyObject *PyUnicode_Concat(PyObject *left,
5197 PyObject *right)
5198{
5199 PyUnicodeObject *u = NULL, *v = NULL, *w;
5200
5201 /* Coerce the two arguments */
5202 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5203 if (u == NULL)
5204 goto onError;
5205 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5206 if (v == NULL)
5207 goto onError;
5208
5209 /* Shortcuts */
5210 if (v == unicode_empty) {
5211 Py_DECREF(v);
5212 return (PyObject *)u;
5213 }
5214 if (u == unicode_empty) {
5215 Py_DECREF(u);
5216 return (PyObject *)v;
5217 }
5218
5219 /* Concat the two Unicode strings */
5220 w = _PyUnicode_New(u->length + v->length);
5221 if (w == NULL)
5222 goto onError;
5223 Py_UNICODE_COPY(w->str, u->str, u->length);
5224 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5225
5226 Py_DECREF(u);
5227 Py_DECREF(v);
5228 return (PyObject *)w;
5229
5230onError:
5231 Py_XDECREF(u);
5232 Py_XDECREF(v);
5233 return NULL;
5234}
5235
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005236PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237"S.count(sub[, start[, end]]) -> int\n\
5238\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00005239Return the number of non-overlapping occurrences of substring sub in\n\
5240Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005241interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242
5243static PyObject *
5244unicode_count(PyUnicodeObject *self, PyObject *args)
5245{
5246 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005247 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005248 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005249 PyObject *result;
5250
Guido van Rossumb8872e62000-05-09 14:14:27 +00005251 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5252 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253 return NULL;
5254
5255 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5256 (PyObject *)substring);
5257 if (substring == NULL)
5258 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005259
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260 if (start < 0)
5261 start += self->length;
5262 if (start < 0)
5263 start = 0;
5264 if (end > self->length)
5265 end = self->length;
5266 if (end < 0)
5267 end += self->length;
5268 if (end < 0)
5269 end = 0;
5270
5271 result = PyInt_FromLong((long) count(self, start, end, substring));
5272
5273 Py_DECREF(substring);
5274 return result;
5275}
5276
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005277PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005278"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005279\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005280Encodes S using the codec registered for encoding. encoding defaults\n\
5281to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005282handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005283a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5284'xmlcharrefreplace' as well as any other name registered with\n\
5285codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286
5287static PyObject *
5288unicode_encode(PyUnicodeObject *self, PyObject *args)
5289{
5290 char *encoding = NULL;
5291 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005292 PyObject *v;
5293
Guido van Rossumd57fd912000-03-10 22:53:23 +00005294 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5295 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005296 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005297 if (v == NULL)
5298 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005299 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5300 PyErr_Format(PyExc_TypeError,
5301 "encoder did not return a string/unicode object "
5302 "(type=%.400s)",
5303 v->ob_type->tp_name);
5304 Py_DECREF(v);
5305 return NULL;
5306 }
5307 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005308
5309 onError:
5310 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005311}
5312
5313PyDoc_STRVAR(decode__doc__,
5314"S.decode([encoding[,errors]]) -> string or unicode\n\
5315\n\
5316Decodes S using the codec registered for encoding. encoding defaults\n\
5317to the default encoding. errors may be given to set a different error\n\
5318handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5319a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5320as well as any other name registerd with codecs.register_error that is\n\
5321able to handle UnicodeDecodeErrors.");
5322
5323static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005324unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005325{
5326 char *encoding = NULL;
5327 char *errors = NULL;
5328 PyObject *v;
5329
5330 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5331 return NULL;
5332 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005333 if (v == NULL)
5334 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005335 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5336 PyErr_Format(PyExc_TypeError,
5337 "decoder did not return a string/unicode object "
5338 "(type=%.400s)",
5339 v->ob_type->tp_name);
5340 Py_DECREF(v);
5341 return NULL;
5342 }
5343 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005344
5345 onError:
5346 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347}
5348
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005349PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350"S.expandtabs([tabsize]) -> unicode\n\
5351\n\
5352Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005353If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354
5355static PyObject*
5356unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5357{
5358 Py_UNICODE *e;
5359 Py_UNICODE *p;
5360 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005361 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362 PyUnicodeObject *u;
5363 int tabsize = 8;
5364
5365 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5366 return NULL;
5367
Thomas Wouters7e474022000-07-16 12:04:32 +00005368 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369 i = j = 0;
5370 e = self->str + self->length;
5371 for (p = self->str; p < e; p++)
5372 if (*p == '\t') {
5373 if (tabsize > 0)
5374 j += tabsize - (j % tabsize);
5375 }
5376 else {
5377 j++;
5378 if (*p == '\n' || *p == '\r') {
5379 i += j;
5380 j = 0;
5381 }
5382 }
5383
5384 /* Second pass: create output string and fill it */
5385 u = _PyUnicode_New(i + j);
5386 if (!u)
5387 return NULL;
5388
5389 j = 0;
5390 q = u->str;
5391
5392 for (p = self->str; p < e; p++)
5393 if (*p == '\t') {
5394 if (tabsize > 0) {
5395 i = tabsize - (j % tabsize);
5396 j += i;
5397 while (i--)
5398 *q++ = ' ';
5399 }
5400 }
5401 else {
5402 j++;
5403 *q++ = *p;
5404 if (*p == '\n' || *p == '\r')
5405 j = 0;
5406 }
5407
5408 return (PyObject*) u;
5409}
5410
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005411PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412"S.find(sub [,start [,end]]) -> int\n\
5413\n\
5414Return the lowest index in S where substring sub is found,\n\
5415such that sub is contained within s[start,end]. Optional\n\
5416arguments start and end are interpreted as in slice notation.\n\
5417\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005418Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419
5420static PyObject *
5421unicode_find(PyUnicodeObject *self, PyObject *args)
5422{
5423 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005424 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005425 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426 PyObject *result;
5427
Guido van Rossumb8872e62000-05-09 14:14:27 +00005428 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5429 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430 return NULL;
5431 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5432 (PyObject *)substring);
5433 if (substring == NULL)
5434 return NULL;
5435
Martin v. Löwis18e16552006-02-15 17:27:45 +00005436 result = PyInt_FromSsize_t(findstring(self, substring, start, end, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437
5438 Py_DECREF(substring);
5439 return result;
5440}
5441
5442static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005443unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444{
5445 if (index < 0 || index >= self->length) {
5446 PyErr_SetString(PyExc_IndexError, "string index out of range");
5447 return NULL;
5448 }
5449
5450 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5451}
5452
5453static long
5454unicode_hash(PyUnicodeObject *self)
5455{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005456 /* Since Unicode objects compare equal to their ASCII string
5457 counterparts, they should use the individual character values
5458 as basis for their hash value. This is needed to assure that
5459 strings and Unicode objects behave in the same way as
5460 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461
Martin v. Löwis18e16552006-02-15 17:27:45 +00005462 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005463 register Py_UNICODE *p;
5464 register long x;
5465
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466 if (self->hash != -1)
5467 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005468 len = PyUnicode_GET_SIZE(self);
5469 p = PyUnicode_AS_UNICODE(self);
5470 x = *p << 7;
5471 while (--len >= 0)
5472 x = (1000003*x) ^ *p++;
5473 x ^= PyUnicode_GET_SIZE(self);
5474 if (x == -1)
5475 x = -2;
5476 self->hash = x;
5477 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478}
5479
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005480PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481"S.index(sub [,start [,end]]) -> int\n\
5482\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005483Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484
5485static PyObject *
5486unicode_index(PyUnicodeObject *self, PyObject *args)
5487{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005488 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005490 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005491 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492
Guido van Rossumb8872e62000-05-09 14:14:27 +00005493 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5494 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005495 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005496
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5498 (PyObject *)substring);
5499 if (substring == NULL)
5500 return NULL;
5501
5502 result = findstring(self, substring, start, end, 1);
5503
5504 Py_DECREF(substring);
5505 if (result < 0) {
5506 PyErr_SetString(PyExc_ValueError, "substring not found");
5507 return NULL;
5508 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005509 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510}
5511
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005512PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005513"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005514\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005515Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005516at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517
5518static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005519unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520{
5521 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5522 register const Py_UNICODE *e;
5523 int cased;
5524
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525 /* Shortcut for single character strings */
5526 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005527 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005529 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005530 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005531 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005532
Guido van Rossumd57fd912000-03-10 22:53:23 +00005533 e = p + PyUnicode_GET_SIZE(self);
5534 cased = 0;
5535 for (; p < e; p++) {
5536 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005537
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005539 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540 else if (!cased && Py_UNICODE_ISLOWER(ch))
5541 cased = 1;
5542 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005543 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005544}
5545
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005546PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005547"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005548\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005549Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005550at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551
5552static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005553unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554{
5555 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5556 register const Py_UNICODE *e;
5557 int cased;
5558
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559 /* Shortcut for single character strings */
5560 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005561 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005563 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005564 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005565 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005566
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567 e = p + PyUnicode_GET_SIZE(self);
5568 cased = 0;
5569 for (; p < e; p++) {
5570 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005571
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005573 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 else if (!cased && Py_UNICODE_ISUPPER(ch))
5575 cased = 1;
5576 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005577 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578}
5579
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005580PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005581"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005583Return True if S is a titlecased string and there is at least one\n\
5584character in S, i.e. upper- and titlecase characters may only\n\
5585follow uncased characters and lowercase characters only cased ones.\n\
5586Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587
5588static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005589unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590{
5591 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5592 register const Py_UNICODE *e;
5593 int cased, previous_is_cased;
5594
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595 /* Shortcut for single character strings */
5596 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005597 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5598 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005600 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005601 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005602 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005603
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604 e = p + PyUnicode_GET_SIZE(self);
5605 cased = 0;
5606 previous_is_cased = 0;
5607 for (; p < e; p++) {
5608 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005609
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5611 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005612 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613 previous_is_cased = 1;
5614 cased = 1;
5615 }
5616 else if (Py_UNICODE_ISLOWER(ch)) {
5617 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005618 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619 previous_is_cased = 1;
5620 cased = 1;
5621 }
5622 else
5623 previous_is_cased = 0;
5624 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005625 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626}
5627
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005628PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005629"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005631Return True if all characters in S are whitespace\n\
5632and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633
5634static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005635unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636{
5637 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5638 register const Py_UNICODE *e;
5639
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640 /* Shortcut for single character strings */
5641 if (PyUnicode_GET_SIZE(self) == 1 &&
5642 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005643 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005645 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005646 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005647 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005648
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649 e = p + PyUnicode_GET_SIZE(self);
5650 for (; p < e; p++) {
5651 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005652 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005653 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005654 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655}
5656
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005657PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005658"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005659\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005660Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005661and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005662
5663static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005664unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005665{
5666 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5667 register const Py_UNICODE *e;
5668
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005669 /* Shortcut for single character strings */
5670 if (PyUnicode_GET_SIZE(self) == 1 &&
5671 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005672 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005673
5674 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005675 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005676 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005677
5678 e = p + PyUnicode_GET_SIZE(self);
5679 for (; p < e; p++) {
5680 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005681 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005682 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005683 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005684}
5685
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005686PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005687"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005688\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005689Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005690and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005691
5692static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005693unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005694{
5695 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5696 register const Py_UNICODE *e;
5697
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005698 /* Shortcut for single character strings */
5699 if (PyUnicode_GET_SIZE(self) == 1 &&
5700 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005701 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005702
5703 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005704 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005705 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005706
5707 e = p + PyUnicode_GET_SIZE(self);
5708 for (; p < e; p++) {
5709 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005710 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005711 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005712 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005713}
5714
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005715PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005716"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005718Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005719False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720
5721static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005722unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723{
5724 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5725 register const Py_UNICODE *e;
5726
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727 /* Shortcut for single character strings */
5728 if (PyUnicode_GET_SIZE(self) == 1 &&
5729 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005730 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005731
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005732 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005733 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005734 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005735
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736 e = p + PyUnicode_GET_SIZE(self);
5737 for (; p < e; p++) {
5738 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005739 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005741 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742}
5743
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005744PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005745"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005747Return True if all characters in S are digits\n\
5748and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749
5750static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005751unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752{
5753 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5754 register const Py_UNICODE *e;
5755
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 /* Shortcut for single character strings */
5757 if (PyUnicode_GET_SIZE(self) == 1 &&
5758 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005759 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005761 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005762 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005763 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005764
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765 e = p + PyUnicode_GET_SIZE(self);
5766 for (; p < e; p++) {
5767 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005768 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005770 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005771}
5772
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005773PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005774"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005776Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005777False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778
5779static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005780unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005781{
5782 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5783 register const Py_UNICODE *e;
5784
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785 /* Shortcut for single character strings */
5786 if (PyUnicode_GET_SIZE(self) == 1 &&
5787 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005788 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005790 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005791 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005792 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005793
Guido van Rossumd57fd912000-03-10 22:53:23 +00005794 e = p + PyUnicode_GET_SIZE(self);
5795 for (; p < e; p++) {
5796 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005797 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005799 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800}
5801
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005802PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803"S.join(sequence) -> unicode\n\
5804\n\
5805Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005806sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005807
5808static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005809unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005811 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005812}
5813
Martin v. Löwis18e16552006-02-15 17:27:45 +00005814static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815unicode_length(PyUnicodeObject *self)
5816{
5817 return self->length;
5818}
5819
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005820PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005821"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005822\n\
5823Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005824done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825
5826static PyObject *
5827unicode_ljust(PyUnicodeObject *self, PyObject *args)
5828{
Martin v. Löwis412fb672006-04-13 06:34:32 +00005829 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005830 Py_UNICODE fillchar = ' ';
5831
Martin v. Löwis412fb672006-04-13 06:34:32 +00005832 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005833 return NULL;
5834
Tim Peters7a29bd52001-09-12 03:03:31 +00005835 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836 Py_INCREF(self);
5837 return (PyObject*) self;
5838 }
5839
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005840 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841}
5842
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005843PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844"S.lower() -> unicode\n\
5845\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005846Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847
5848static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005849unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851 return fixup(self, fixlower);
5852}
5853
/* Strip modes; they double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string minus its leading
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
5862
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005863/* externally visible for str.strip(unicode) */
5864PyObject *
5865_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5866{
5867 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005868 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005869 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005870 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
5871 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005872
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005873 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
5874
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005875 i = 0;
5876 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005877 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
5878 i++;
5879 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005880 }
5881
5882 j = len;
5883 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005884 do {
5885 j--;
5886 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
5887 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005888 }
5889
5890 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005891 Py_INCREF(self);
5892 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005893 }
5894 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005895 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005896}
5897
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898
5899static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005900do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005902 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005903 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005904
5905 i = 0;
5906 if (striptype != RIGHTSTRIP) {
5907 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5908 i++;
5909 }
5910 }
5911
5912 j = len;
5913 if (striptype != LEFTSTRIP) {
5914 do {
5915 j--;
5916 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5917 j++;
5918 }
5919
5920 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5921 Py_INCREF(self);
5922 return (PyObject*)self;
5923 }
5924 else
5925 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926}
5927
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005928
5929static PyObject *
5930do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5931{
5932 PyObject *sep = NULL;
5933
5934 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5935 return NULL;
5936
5937 if (sep != NULL && sep != Py_None) {
5938 if (PyUnicode_Check(sep))
5939 return _PyUnicode_XStrip(self, striptype, sep);
5940 else if (PyString_Check(sep)) {
5941 PyObject *res;
5942 sep = PyUnicode_FromObject(sep);
5943 if (sep==NULL)
5944 return NULL;
5945 res = _PyUnicode_XStrip(self, striptype, sep);
5946 Py_DECREF(sep);
5947 return res;
5948 }
5949 else {
5950 PyErr_Format(PyExc_TypeError,
5951 "%s arg must be None, unicode or str",
5952 STRIPNAME(striptype));
5953 return NULL;
5954 }
5955 }
5956
5957 return do_strip(self, striptype);
5958}
5959
5960
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005961PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005962"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005963\n\
5964Return a copy of the string S with leading and trailing\n\
5965whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005966If chars is given and not None, remove characters in chars instead.\n\
5967If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005968
5969static PyObject *
5970unicode_strip(PyUnicodeObject *self, PyObject *args)
5971{
5972 if (PyTuple_GET_SIZE(args) == 0)
5973 return do_strip(self, BOTHSTRIP); /* Common case */
5974 else
5975 return do_argstrip(self, BOTHSTRIP, args);
5976}
5977
5978
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005979PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005980"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005981\n\
5982Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005983If chars is given and not None, remove characters in chars instead.\n\
5984If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005985
5986static PyObject *
5987unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5988{
5989 if (PyTuple_GET_SIZE(args) == 0)
5990 return do_strip(self, LEFTSTRIP); /* Common case */
5991 else
5992 return do_argstrip(self, LEFTSTRIP, args);
5993}
5994
5995
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005996PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005997"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005998\n\
5999Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006000If chars is given and not None, remove characters in chars instead.\n\
6001If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006002
6003static PyObject *
6004unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6005{
6006 if (PyTuple_GET_SIZE(args) == 0)
6007 return do_strip(self, RIGHTSTRIP); /* Common case */
6008 else
6009 return do_argstrip(self, RIGHTSTRIP, args);
6010}
6011
6012
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006014unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015{
6016 PyUnicodeObject *u;
6017 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006018 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006019 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020
6021 if (len < 0)
6022 len = 0;
6023
Tim Peters7a29bd52001-09-12 03:03:31 +00006024 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025 /* no repeat, return original string */
6026 Py_INCREF(str);
6027 return (PyObject*) str;
6028 }
Tim Peters8f422462000-09-09 06:13:41 +00006029
6030 /* ensure # of chars needed doesn't overflow int and # of bytes
6031 * needed doesn't overflow size_t
6032 */
6033 nchars = len * str->length;
6034 if (len && nchars / len != str->length) {
6035 PyErr_SetString(PyExc_OverflowError,
6036 "repeated string is too long");
6037 return NULL;
6038 }
6039 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6040 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6041 PyErr_SetString(PyExc_OverflowError,
6042 "repeated string is too long");
6043 return NULL;
6044 }
6045 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046 if (!u)
6047 return NULL;
6048
6049 p = u->str;
6050
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006051 if (str->length == 1 && len > 0) {
6052 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006053 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00006054 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006055 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006056 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006057 done = str->length;
6058 }
6059 while (done < nchars) {
6060 int n = (done <= nchars-done) ? done : nchars-done;
6061 Py_UNICODE_COPY(p+done, p, n);
6062 done += n;
6063 }
6064 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006065
6066 return (PyObject*) u;
6067}
6068
6069PyObject *PyUnicode_Replace(PyObject *obj,
6070 PyObject *subobj,
6071 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006072 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073{
6074 PyObject *self;
6075 PyObject *str1;
6076 PyObject *str2;
6077 PyObject *result;
6078
6079 self = PyUnicode_FromObject(obj);
6080 if (self == NULL)
6081 return NULL;
6082 str1 = PyUnicode_FromObject(subobj);
6083 if (str1 == NULL) {
6084 Py_DECREF(self);
6085 return NULL;
6086 }
6087 str2 = PyUnicode_FromObject(replobj);
6088 if (str2 == NULL) {
6089 Py_DECREF(self);
6090 Py_DECREF(str1);
6091 return NULL;
6092 }
Tim Petersced69f82003-09-16 20:30:58 +00006093 result = replace((PyUnicodeObject *)self,
6094 (PyUnicodeObject *)str1,
6095 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096 maxcount);
6097 Py_DECREF(self);
6098 Py_DECREF(str1);
6099 Py_DECREF(str2);
6100 return result;
6101}
6102
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006103PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104"S.replace (old, new[, maxsplit]) -> unicode\n\
6105\n\
6106Return a copy of S with all occurrences of substring\n\
6107old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006108given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109
6110static PyObject*
6111unicode_replace(PyUnicodeObject *self, PyObject *args)
6112{
6113 PyUnicodeObject *str1;
6114 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006115 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116 PyObject *result;
6117
Martin v. Löwis18e16552006-02-15 17:27:45 +00006118 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119 return NULL;
6120 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6121 if (str1 == NULL)
6122 return NULL;
6123 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006124 if (str2 == NULL) {
6125 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006127 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128
6129 result = replace(self, str1, str2, maxcount);
6130
6131 Py_DECREF(str1);
6132 Py_DECREF(str2);
6133 return result;
6134}
6135
6136static
6137PyObject *unicode_repr(PyObject *unicode)
6138{
6139 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6140 PyUnicode_GET_SIZE(unicode),
6141 1);
6142}
6143
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006144PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145"S.rfind(sub [,start [,end]]) -> int\n\
6146\n\
6147Return the highest index in S where substring sub is found,\n\
6148such that sub is contained within s[start,end]. Optional\n\
6149arguments start and end are interpreted as in slice notation.\n\
6150\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006151Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152
6153static PyObject *
6154unicode_rfind(PyUnicodeObject *self, PyObject *args)
6155{
6156 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006157 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006158 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159 PyObject *result;
6160
Guido van Rossumb8872e62000-05-09 14:14:27 +00006161 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6162 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163 return NULL;
6164 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6165 (PyObject *)substring);
6166 if (substring == NULL)
6167 return NULL;
6168
Martin v. Löwis18e16552006-02-15 17:27:45 +00006169 result = PyInt_FromSsize_t(findstring(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170
6171 Py_DECREF(substring);
6172 return result;
6173}
6174
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006175PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176"S.rindex(sub [,start [,end]]) -> int\n\
6177\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006178Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179
6180static PyObject *
6181unicode_rindex(PyUnicodeObject *self, PyObject *args)
6182{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006183 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006185 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006186 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187
Guido van Rossumb8872e62000-05-09 14:14:27 +00006188 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6189 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190 return NULL;
6191 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6192 (PyObject *)substring);
6193 if (substring == NULL)
6194 return NULL;
6195
6196 result = findstring(self, substring, start, end, -1);
6197
6198 Py_DECREF(substring);
6199 if (result < 0) {
6200 PyErr_SetString(PyExc_ValueError, "substring not found");
6201 return NULL;
6202 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006203 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204}
6205
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006206PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006207"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208\n\
6209Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006210done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211
6212static PyObject *
6213unicode_rjust(PyUnicodeObject *self, PyObject *args)
6214{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006215 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006216 Py_UNICODE fillchar = ' ';
6217
Martin v. Löwis412fb672006-04-13 06:34:32 +00006218 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219 return NULL;
6220
Tim Peters7a29bd52001-09-12 03:03:31 +00006221 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222 Py_INCREF(self);
6223 return (PyObject*) self;
6224 }
6225
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006226 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227}
6228
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006230unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231{
6232 /* standard clamping */
6233 if (start < 0)
6234 start = 0;
6235 if (end < 0)
6236 end = 0;
6237 if (end > self->length)
6238 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006239 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240 /* full slice, return original string */
6241 Py_INCREF(self);
6242 return (PyObject*) self;
6243 }
6244 if (start > end)
6245 start = end;
6246 /* copy slice */
6247 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6248 end - start);
6249}
6250
6251PyObject *PyUnicode_Split(PyObject *s,
6252 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006253 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254{
6255 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006256
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257 s = PyUnicode_FromObject(s);
6258 if (s == NULL)
6259 return NULL;
6260 if (sep != NULL) {
6261 sep = PyUnicode_FromObject(sep);
6262 if (sep == NULL) {
6263 Py_DECREF(s);
6264 return NULL;
6265 }
6266 }
6267
6268 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6269
6270 Py_DECREF(s);
6271 Py_XDECREF(sep);
6272 return result;
6273}
6274
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006275PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276"S.split([sep [,maxsplit]]) -> list of strings\n\
6277\n\
6278Return a list of the words in S, using sep as the\n\
6279delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006280splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006281any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282
6283static PyObject*
6284unicode_split(PyUnicodeObject *self, PyObject *args)
6285{
6286 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006287 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288
Martin v. Löwis18e16552006-02-15 17:27:45 +00006289 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290 return NULL;
6291
6292 if (substring == Py_None)
6293 return split(self, NULL, maxcount);
6294 else if (PyUnicode_Check(substring))
6295 return split(self, (PyUnicodeObject *)substring, maxcount);
6296 else
6297 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6298}
6299
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006300PyObject *PyUnicode_RSplit(PyObject *s,
6301 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006302 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006303{
6304 PyObject *result;
6305
6306 s = PyUnicode_FromObject(s);
6307 if (s == NULL)
6308 return NULL;
6309 if (sep != NULL) {
6310 sep = PyUnicode_FromObject(sep);
6311 if (sep == NULL) {
6312 Py_DECREF(s);
6313 return NULL;
6314 }
6315 }
6316
6317 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6318
6319 Py_DECREF(s);
6320 Py_XDECREF(sep);
6321 return result;
6322}
6323
6324PyDoc_STRVAR(rsplit__doc__,
6325"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6326\n\
6327Return a list of the words in S, using sep as the\n\
6328delimiter string, starting at the end of the string and\n\
6329working to the front. If maxsplit is given, at most maxsplit\n\
6330splits are done. If sep is not specified, any whitespace string\n\
6331is a separator.");
6332
6333static PyObject*
6334unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6335{
6336 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006337 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006338
Martin v. Löwis18e16552006-02-15 17:27:45 +00006339 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006340 return NULL;
6341
6342 if (substring == Py_None)
6343 return rsplit(self, NULL, maxcount);
6344 else if (PyUnicode_Check(substring))
6345 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6346 else
6347 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6348}
6349
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006350PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006351"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006352\n\
6353Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006354Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006355is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006356
6357static PyObject*
6358unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6359{
Guido van Rossum86662912000-04-11 15:38:46 +00006360 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006361
Guido van Rossum86662912000-04-11 15:38:46 +00006362 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363 return NULL;
6364
Guido van Rossum86662912000-04-11 15:38:46 +00006365 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006366}
6367
6368static
6369PyObject *unicode_str(PyUnicodeObject *self)
6370{
Fred Drakee4315f52000-05-09 19:53:39 +00006371 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372}
6373
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006374PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375"S.swapcase() -> unicode\n\
6376\n\
6377Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006378and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379
6380static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006381unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006382{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383 return fixup(self, fixswapcase);
6384}
6385
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006386PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387"S.translate(table) -> unicode\n\
6388\n\
6389Return a copy of the string S, where all characters have been mapped\n\
6390through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006391Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6392Unmapped characters are left untouched. Characters mapped to None\n\
6393are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394
6395static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006396unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397{
Tim Petersced69f82003-09-16 20:30:58 +00006398 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006400 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401 "ignore");
6402}
6403
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006404PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405"S.upper() -> unicode\n\
6406\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006407Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408
6409static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006410unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412 return fixup(self, fixupper);
6413}
6414
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006415PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416"S.zfill(width) -> unicode\n\
6417\n\
6418Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006419of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420
6421static PyObject *
6422unicode_zfill(PyUnicodeObject *self, PyObject *args)
6423{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006424 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425 PyUnicodeObject *u;
6426
Martin v. Löwis18e16552006-02-15 17:27:45 +00006427 Py_ssize_t width;
6428 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429 return NULL;
6430
6431 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006432 if (PyUnicode_CheckExact(self)) {
6433 Py_INCREF(self);
6434 return (PyObject*) self;
6435 }
6436 else
6437 return PyUnicode_FromUnicode(
6438 PyUnicode_AS_UNICODE(self),
6439 PyUnicode_GET_SIZE(self)
6440 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441 }
6442
6443 fill = width - self->length;
6444
6445 u = pad(self, fill, 0, '0');
6446
Walter Dörwald068325e2002-04-15 13:36:47 +00006447 if (u == NULL)
6448 return NULL;
6449
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 if (u->str[fill] == '+' || u->str[fill] == '-') {
6451 /* move sign to beginning of string */
6452 u->str[0] = u->str[fill];
6453 u->str[fill] = '0';
6454 }
6455
6456 return (PyObject*) u;
6457}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458
#if 0
/* Compiled out.  Returns the current value of unicode_freelist_size as a
   Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = unicode_freelist_size;

    return PyInt_FromLong(count);
}
#endif
6466
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006467PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006468"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006470Return True if S starts with the specified prefix, False otherwise.\n\
6471With optional start, test S beginning at that position.\n\
6472With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473
6474static PyObject *
6475unicode_startswith(PyUnicodeObject *self,
6476 PyObject *args)
6477{
6478 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006479 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006480 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481 PyObject *result;
6482
Guido van Rossumb8872e62000-05-09 14:14:27 +00006483 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6484 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485 return NULL;
6486 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6487 (PyObject *)substring);
6488 if (substring == NULL)
6489 return NULL;
6490
Guido van Rossum77f6a652002-04-03 22:41:51 +00006491 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492
6493 Py_DECREF(substring);
6494 return result;
6495}
6496
6497
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006498PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006499"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006501Return True if S ends with the specified suffix, False otherwise.\n\
6502With optional start, test S beginning at that position.\n\
6503With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504
6505static PyObject *
6506unicode_endswith(PyUnicodeObject *self,
6507 PyObject *args)
6508{
6509 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006510 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006511 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512 PyObject *result;
6513
Guido van Rossumb8872e62000-05-09 14:14:27 +00006514 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6515 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516 return NULL;
6517 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6518 (PyObject *)substring);
6519 if (substring == NULL)
6520 return NULL;
6521
Guido van Rossum77f6a652002-04-03 22:41:51 +00006522 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523
6524 Py_DECREF(substring);
6525 return result;
6526}
6527
6528
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006529
6530static PyObject *
6531unicode_getnewargs(PyUnicodeObject *v)
6532{
6533 return Py_BuildValue("(u#)", v->str, v->length);
6534}
6535
6536
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537static PyMethodDef unicode_methods[] = {
6538
6539 /* Order is according to common usage: often used methods should
6540 appear first, since lookup is done sequentially. */
6541
Georg Brandlecdc0a92006-03-30 12:19:07 +00006542 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006543 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6544 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006545 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006546 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6547 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6548 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6549 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6550 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6551 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6552 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
6553 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6554 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6555 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006556 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006557 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006558/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6559 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6560 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6561 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006562 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006563 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006564 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006565 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6566 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6567 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6568 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6569 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6570 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6571 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6572 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6573 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6574 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6575 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6576 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6577 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6578 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006579 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006580#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006581 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582#endif
6583
6584#if 0
6585 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006586 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006587#endif
6588
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006589 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590 {NULL, NULL}
6591};
6592
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006593static PyObject *
6594unicode_mod(PyObject *v, PyObject *w)
6595{
6596 if (!PyUnicode_Check(v)) {
6597 Py_INCREF(Py_NotImplemented);
6598 return Py_NotImplemented;
6599 }
6600 return PyUnicode_Format(v, w);
6601}
6602
6603static PyNumberMethods unicode_as_number = {
6604 0, /*nb_add*/
6605 0, /*nb_subtract*/
6606 0, /*nb_multiply*/
6607 0, /*nb_divide*/
6608 unicode_mod, /*nb_remainder*/
6609};
6610
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006612 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00006613 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006614 (ssizeargfunc) unicode_repeat, /* sq_repeat */
6615 (ssizeargfunc) unicode_getitem, /* sq_item */
6616 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617 0, /* sq_ass_item */
6618 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00006619 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620};
6621
/* Non-zero when the type of object o has the Py_TPFLAGS_HAVE_INDEX feature
   flag set. */
#define HASINDEX(o) \
    PyType_HasFeature((o)->ob_type, Py_TPFLAGS_HAVE_INDEX)
6623
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006624static PyObject*
6625unicode_subscript(PyUnicodeObject* self, PyObject* item)
6626{
Guido van Rossum38fff8c2006-03-07 18:50:55 +00006627 PyNumberMethods *nb = item->ob_type->tp_as_number;
6628 if (nb != NULL && HASINDEX(item) && nb->nb_index != NULL) {
6629 Py_ssize_t i = nb->nb_index(item);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006630 if (i == -1 && PyErr_Occurred())
6631 return NULL;
6632 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006633 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006634 return unicode_getitem(self, i);
6635 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006636 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006637 Py_UNICODE* source_buf;
6638 Py_UNICODE* result_buf;
6639 PyObject* result;
6640
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006641 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006642 &start, &stop, &step, &slicelength) < 0) {
6643 return NULL;
6644 }
6645
6646 if (slicelength <= 0) {
6647 return PyUnicode_FromUnicode(NULL, 0);
6648 } else {
6649 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Anthony Baxtera6286212006-04-11 07:42:36 +00006650 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
6651 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006652
6653 if (result_buf == NULL)
6654 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006655
6656 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6657 result_buf[i] = source_buf[cur];
6658 }
Tim Petersced69f82003-09-16 20:30:58 +00006659
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006660 result = PyUnicode_FromUnicode(result_buf, slicelength);
6661 PyMem_FREE(result_buf);
6662 return result;
6663 }
6664 } else {
6665 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6666 return NULL;
6667 }
6668}
6669
6670static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006671 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006672 (binaryfunc)unicode_subscript, /* mp_subscript */
6673 (objobjargproc)0, /* mp_ass_subscript */
6674};
6675
Martin v. Löwis18e16552006-02-15 17:27:45 +00006676static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006678 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679 const void **ptr)
6680{
6681 if (index != 0) {
6682 PyErr_SetString(PyExc_SystemError,
6683 "accessing non-existent unicode segment");
6684 return -1;
6685 }
6686 *ptr = (void *) self->str;
6687 return PyUnicode_GET_DATA_SIZE(self);
6688}
6689
Martin v. Löwis18e16552006-02-15 17:27:45 +00006690static Py_ssize_t
6691unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692 const void **ptr)
6693{
6694 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006695 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696 return -1;
6697}
6698
6699static int
6700unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006701 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702{
6703 if (lenp)
6704 *lenp = PyUnicode_GET_DATA_SIZE(self);
6705 return 1;
6706}
6707
Martin v. Löwiseb079f12006-02-16 14:32:27 +00006708static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006710 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711 const void **ptr)
6712{
6713 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006714
Guido van Rossumd57fd912000-03-10 22:53:23 +00006715 if (index != 0) {
6716 PyErr_SetString(PyExc_SystemError,
6717 "accessing non-existent unicode segment");
6718 return -1;
6719 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006720 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721 if (str == NULL)
6722 return -1;
6723 *ptr = (void *) PyString_AS_STRING(str);
6724 return PyString_GET_SIZE(str);
6725}
6726
6727/* Helpers for PyUnicode_Format() */
6728
6729static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006730getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006732 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733 if (argidx < arglen) {
6734 (*p_argidx)++;
6735 if (arglen < 0)
6736 return args;
6737 else
6738 return PyTuple_GetItem(args, argidx);
6739 }
6740 PyErr_SetString(PyExc_TypeError,
6741 "not enough arguments for format string");
6742 return NULL;
6743}
6744
/* Bit flags for the formatting helpers, one bit each.  formatfloat() emits
   '#' in the C format when F_ALT is set.
   NOTE(review): by their names the others presumably mirror the '-', '+',
   ' ' and '0' conversion flags -- confirm in PyUnicode_Format(). */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
6750
Martin v. Löwis18e16552006-02-15 17:27:45 +00006751static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00006752strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006754 register Py_ssize_t i;
6755 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756 for (i = len - 1; i >= 0; i--)
6757 buffer[i] = (Py_UNICODE) charbuffer[i];
6758
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759 return len;
6760}
6761
Neal Norwitzfc76d632006-01-10 06:03:13 +00006762static int
6763doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
6764{
Tim Peters15231542006-02-16 01:08:01 +00006765 Py_ssize_t result;
6766
Neal Norwitzfc76d632006-01-10 06:03:13 +00006767 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006768 result = strtounicode(buffer, (char *)buffer);
6769 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006770}
6771
6772static int
6773longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
6774{
Tim Peters15231542006-02-16 01:08:01 +00006775 Py_ssize_t result;
6776
Neal Norwitzfc76d632006-01-10 06:03:13 +00006777 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006778 result = strtounicode(buffer, (char *)buffer);
6779 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006780}
6781
Guido van Rossum078151d2002-08-11 04:24:12 +00006782/* XXX To save some code duplication, formatfloat/long/int could have been
6783 shared with stringobject.c, converting from 8-bit to Unicode after the
6784 formatting is done. */
6785
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786static int
6787formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006788 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789 int flags,
6790 int prec,
6791 int type,
6792 PyObject *v)
6793{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006794 /* fmt = '%#.' + `prec` + `type`
6795 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796 char fmt[20];
6797 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006798
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799 x = PyFloat_AsDouble(v);
6800 if (x == -1.0 && PyErr_Occurred())
6801 return -1;
6802 if (prec < 0)
6803 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6805 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006806 /* Worst case length calc to ensure no buffer overrun:
6807
6808 'g' formats:
6809 fmt = %#.<prec>g
6810 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6811 for any double rep.)
6812 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6813
6814 'f' formats:
6815 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6816 len = 1 + 50 + 1 + prec = 52 + prec
6817
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006818 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006819 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006820
6821 */
6822 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6823 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006824 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006825 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006826 return -1;
6827 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006828 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6829 (flags&F_ALT) ? "#" : "",
6830 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006831 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006832}
6833
Tim Peters38fd5b62000-09-21 05:43:11 +00006834static PyObject*
6835formatlong(PyObject *val, int flags, int prec, int type)
6836{
6837 char *buf;
6838 int i, len;
6839 PyObject *str; /* temporary string object. */
6840 PyUnicodeObject *result;
6841
6842 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6843 if (!str)
6844 return NULL;
6845 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006846 if (!result) {
6847 Py_DECREF(str);
6848 return NULL;
6849 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006850 for (i = 0; i < len; i++)
6851 result->str[i] = buf[i];
6852 result->str[len] = 0;
6853 Py_DECREF(str);
6854 return (PyObject*)result;
6855}
6856
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857static int
6858formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006859 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860 int flags,
6861 int prec,
6862 int type,
6863 PyObject *v)
6864{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006865 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006866 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6867 * + 1 + 1
6868 * = 24
6869 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006870 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006871 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872 long x;
6873
6874 x = PyInt_AsLong(v);
6875 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006876 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006877 if (x < 0 && type == 'u') {
6878 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006879 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006880 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6881 sign = "-";
6882 else
6883 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006885 prec = 1;
6886
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006887 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6888 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006889 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006890 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006891 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006892 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006893 return -1;
6894 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006895
6896 if ((flags & F_ALT) &&
6897 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006898 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006899 * of issues that cause pain:
6900 * - when 0 is being converted, the C standard leaves off
6901 * the '0x' or '0X', which is inconsistent with other
6902 * %#x/%#X conversions and inconsistent with Python's
6903 * hex() function
6904 * - there are platforms that violate the standard and
6905 * convert 0 with the '0x' or '0X'
6906 * (Metrowerks, Compaq Tru64)
6907 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006908 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006909 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006910 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006911 * We can achieve the desired consistency by inserting our
6912 * own '0x' or '0X' prefix, and substituting %x/%X in place
6913 * of %#x/%#X.
6914 *
6915 * Note that this is the same approach as used in
6916 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006917 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006918 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6919 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006920 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006921 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006922 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6923 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006924 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006925 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006926 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00006927 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006928 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00006929 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930}
6931
6932static int
6933formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006934 size_t buflen,
6935 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006937 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006938 if (PyUnicode_Check(v)) {
6939 if (PyUnicode_GET_SIZE(v) != 1)
6940 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006942 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006944 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006945 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006946 goto onError;
6947 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6948 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949
6950 else {
6951 /* Integer input truncated to a character */
6952 long x;
6953 x = PyInt_AsLong(v);
6954 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006955 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006956#ifdef Py_UNICODE_WIDE
6957 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006958 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006959 "%c arg not in range(0x110000) "
6960 "(wide Python build)");
6961 return -1;
6962 }
6963#else
6964 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006965 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006966 "%c arg not in range(0x10000) "
6967 "(narrow Python build)");
6968 return -1;
6969 }
6970#endif
6971 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972 }
6973 buf[1] = '\0';
6974 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006975
6976 onError:
6977 PyErr_SetString(PyExc_TypeError,
6978 "%c requires int or char");
6979 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980}
6981
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that it behaves as a single
   operand wherever the macro is used (e.g. after a unary operator).
*/
#define FORMATBUFLEN ((size_t)120)
6991
Guido van Rossumd57fd912000-03-10 22:53:23 +00006992PyObject *PyUnicode_Format(PyObject *format,
6993 PyObject *args)
6994{
6995 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006996 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997 int args_owned = 0;
6998 PyUnicodeObject *result = NULL;
6999 PyObject *dict = NULL;
7000 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007001
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002 if (format == NULL || args == NULL) {
7003 PyErr_BadInternalCall();
7004 return NULL;
7005 }
7006 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007007 if (uformat == NULL)
7008 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009 fmt = PyUnicode_AS_UNICODE(uformat);
7010 fmtcnt = PyUnicode_GET_SIZE(uformat);
7011
7012 reslen = rescnt = fmtcnt + 100;
7013 result = _PyUnicode_New(reslen);
7014 if (result == NULL)
7015 goto onError;
7016 res = PyUnicode_AS_UNICODE(result);
7017
7018 if (PyTuple_Check(args)) {
7019 arglen = PyTuple_Size(args);
7020 argidx = 0;
7021 }
7022 else {
7023 arglen = -1;
7024 argidx = -2;
7025 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007026 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7027 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028 dict = args;
7029
7030 while (--fmtcnt >= 0) {
7031 if (*fmt != '%') {
7032 if (--rescnt < 0) {
7033 rescnt = fmtcnt + 100;
7034 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007035 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007036 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7038 --rescnt;
7039 }
7040 *res++ = *fmt++;
7041 }
7042 else {
7043 /* Got a format specifier */
7044 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007045 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007047 Py_UNICODE c = '\0';
7048 Py_UNICODE fill;
7049 PyObject *v = NULL;
7050 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007051 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007052 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007053 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007054 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007055
7056 fmt++;
7057 if (*fmt == '(') {
7058 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007059 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060 PyObject *key;
7061 int pcount = 1;
7062
7063 if (dict == NULL) {
7064 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007065 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007066 goto onError;
7067 }
7068 ++fmt;
7069 --fmtcnt;
7070 keystart = fmt;
7071 /* Skip over balanced parentheses */
7072 while (pcount > 0 && --fmtcnt >= 0) {
7073 if (*fmt == ')')
7074 --pcount;
7075 else if (*fmt == '(')
7076 ++pcount;
7077 fmt++;
7078 }
7079 keylen = fmt - keystart - 1;
7080 if (fmtcnt < 0 || pcount > 0) {
7081 PyErr_SetString(PyExc_ValueError,
7082 "incomplete format key");
7083 goto onError;
7084 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007085#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007086 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007087 then looked up since Python uses strings to hold
7088 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007089 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007090 key = PyUnicode_EncodeUTF8(keystart,
7091 keylen,
7092 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007093#else
7094 key = PyUnicode_FromUnicode(keystart, keylen);
7095#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007096 if (key == NULL)
7097 goto onError;
7098 if (args_owned) {
7099 Py_DECREF(args);
7100 args_owned = 0;
7101 }
7102 args = PyObject_GetItem(dict, key);
7103 Py_DECREF(key);
7104 if (args == NULL) {
7105 goto onError;
7106 }
7107 args_owned = 1;
7108 arglen = -1;
7109 argidx = -2;
7110 }
7111 while (--fmtcnt >= 0) {
7112 switch (c = *fmt++) {
7113 case '-': flags |= F_LJUST; continue;
7114 case '+': flags |= F_SIGN; continue;
7115 case ' ': flags |= F_BLANK; continue;
7116 case '#': flags |= F_ALT; continue;
7117 case '0': flags |= F_ZERO; continue;
7118 }
7119 break;
7120 }
7121 if (c == '*') {
7122 v = getnextarg(args, arglen, &argidx);
7123 if (v == NULL)
7124 goto onError;
7125 if (!PyInt_Check(v)) {
7126 PyErr_SetString(PyExc_TypeError,
7127 "* wants int");
7128 goto onError;
7129 }
7130 width = PyInt_AsLong(v);
7131 if (width < 0) {
7132 flags |= F_LJUST;
7133 width = -width;
7134 }
7135 if (--fmtcnt >= 0)
7136 c = *fmt++;
7137 }
7138 else if (c >= '0' && c <= '9') {
7139 width = c - '0';
7140 while (--fmtcnt >= 0) {
7141 c = *fmt++;
7142 if (c < '0' || c > '9')
7143 break;
7144 if ((width*10) / 10 != width) {
7145 PyErr_SetString(PyExc_ValueError,
7146 "width too big");
7147 goto onError;
7148 }
7149 width = width*10 + (c - '0');
7150 }
7151 }
7152 if (c == '.') {
7153 prec = 0;
7154 if (--fmtcnt >= 0)
7155 c = *fmt++;
7156 if (c == '*') {
7157 v = getnextarg(args, arglen, &argidx);
7158 if (v == NULL)
7159 goto onError;
7160 if (!PyInt_Check(v)) {
7161 PyErr_SetString(PyExc_TypeError,
7162 "* wants int");
7163 goto onError;
7164 }
7165 prec = PyInt_AsLong(v);
7166 if (prec < 0)
7167 prec = 0;
7168 if (--fmtcnt >= 0)
7169 c = *fmt++;
7170 }
7171 else if (c >= '0' && c <= '9') {
7172 prec = c - '0';
7173 while (--fmtcnt >= 0) {
7174 c = Py_CHARMASK(*fmt++);
7175 if (c < '0' || c > '9')
7176 break;
7177 if ((prec*10) / 10 != prec) {
7178 PyErr_SetString(PyExc_ValueError,
7179 "prec too big");
7180 goto onError;
7181 }
7182 prec = prec*10 + (c - '0');
7183 }
7184 }
7185 } /* prec */
7186 if (fmtcnt >= 0) {
7187 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007188 if (--fmtcnt >= 0)
7189 c = *fmt++;
7190 }
7191 }
7192 if (fmtcnt < 0) {
7193 PyErr_SetString(PyExc_ValueError,
7194 "incomplete format");
7195 goto onError;
7196 }
7197 if (c != '%') {
7198 v = getnextarg(args, arglen, &argidx);
7199 if (v == NULL)
7200 goto onError;
7201 }
7202 sign = 0;
7203 fill = ' ';
7204 switch (c) {
7205
7206 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007207 pbuf = formatbuf;
7208 /* presume that buffer length is at least 1 */
7209 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210 len = 1;
7211 break;
7212
7213 case 's':
7214 case 'r':
7215 if (PyUnicode_Check(v) && c == 's') {
7216 temp = v;
7217 Py_INCREF(temp);
7218 }
7219 else {
7220 PyObject *unicode;
7221 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007222 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007223 else
7224 temp = PyObject_Repr(v);
7225 if (temp == NULL)
7226 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007227 if (PyUnicode_Check(temp))
7228 /* nothing to do */;
7229 else if (PyString_Check(temp)) {
7230 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007231 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007233 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007234 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007235 Py_DECREF(temp);
7236 temp = unicode;
7237 if (temp == NULL)
7238 goto onError;
7239 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007240 else {
7241 Py_DECREF(temp);
7242 PyErr_SetString(PyExc_TypeError,
7243 "%s argument has non-string str()");
7244 goto onError;
7245 }
7246 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007247 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248 len = PyUnicode_GET_SIZE(temp);
7249 if (prec >= 0 && len > prec)
7250 len = prec;
7251 break;
7252
7253 case 'i':
7254 case 'd':
7255 case 'u':
7256 case 'o':
7257 case 'x':
7258 case 'X':
7259 if (c == 'i')
7260 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007261 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007262 temp = formatlong(v, flags, prec, c);
7263 if (!temp)
7264 goto onError;
7265 pbuf = PyUnicode_AS_UNICODE(temp);
7266 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007267 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007269 else {
7270 pbuf = formatbuf;
7271 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7272 flags, prec, c, v);
7273 if (len < 0)
7274 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007275 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007276 }
7277 if (flags & F_ZERO)
7278 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279 break;
7280
7281 case 'e':
7282 case 'E':
7283 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007284 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007285 case 'g':
7286 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007287 if (c == 'F')
7288 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007289 pbuf = formatbuf;
7290 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7291 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292 if (len < 0)
7293 goto onError;
7294 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007295 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007296 fill = '0';
7297 break;
7298
7299 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007300 pbuf = formatbuf;
7301 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302 if (len < 0)
7303 goto onError;
7304 break;
7305
7306 default:
7307 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007308 "unsupported format character '%c' (0x%x) "
7309 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00007310 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007311 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00007312 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313 goto onError;
7314 }
7315 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007316 if (*pbuf == '-' || *pbuf == '+') {
7317 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007318 len--;
7319 }
7320 else if (flags & F_SIGN)
7321 sign = '+';
7322 else if (flags & F_BLANK)
7323 sign = ' ';
7324 else
7325 sign = 0;
7326 }
7327 if (width < len)
7328 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007329 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330 reslen -= rescnt;
7331 rescnt = width + fmtcnt + 100;
7332 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007333 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007334 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007335 PyErr_NoMemory();
7336 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007337 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007338 if (_PyUnicode_Resize(&result, reslen) < 0) {
7339 Py_XDECREF(temp);
7340 goto onError;
7341 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342 res = PyUnicode_AS_UNICODE(result)
7343 + reslen - rescnt;
7344 }
7345 if (sign) {
7346 if (fill != ' ')
7347 *res++ = sign;
7348 rescnt--;
7349 if (width > len)
7350 width--;
7351 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007352 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7353 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007354 assert(pbuf[1] == c);
7355 if (fill != ' ') {
7356 *res++ = *pbuf++;
7357 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007358 }
Tim Petersfff53252001-04-12 18:38:48 +00007359 rescnt -= 2;
7360 width -= 2;
7361 if (width < 0)
7362 width = 0;
7363 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007364 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007365 if (width > len && !(flags & F_LJUST)) {
7366 do {
7367 --rescnt;
7368 *res++ = fill;
7369 } while (--width > len);
7370 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007371 if (fill == ' ') {
7372 if (sign)
7373 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007374 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007375 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007376 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007377 *res++ = *pbuf++;
7378 *res++ = *pbuf++;
7379 }
7380 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007381 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007382 res += len;
7383 rescnt -= len;
7384 while (--width >= len) {
7385 --rescnt;
7386 *res++ = ' ';
7387 }
7388 if (dict && (argidx < arglen) && c != '%') {
7389 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007390 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007391 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007392 goto onError;
7393 }
7394 Py_XDECREF(temp);
7395 } /* '%' */
7396 } /* until end */
7397 if (argidx < arglen && !dict) {
7398 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007399 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400 goto onError;
7401 }
7402
Thomas Woutersa96affe2006-03-12 00:29:36 +00007403 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7404 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405 if (args_owned) {
7406 Py_DECREF(args);
7407 }
7408 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007409 return (PyObject *)result;
7410
7411 onError:
7412 Py_XDECREF(result);
7413 Py_DECREF(uformat);
7414 if (args_owned) {
7415 Py_DECREF(args);
7416 }
7417 return NULL;
7418}
7419
7420static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007421 (readbufferproc) unicode_buffer_getreadbuf,
7422 (writebufferproc) unicode_buffer_getwritebuf,
7423 (segcountproc) unicode_buffer_getsegcount,
7424 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007425};
7426
Jeremy Hylton938ace62002-07-17 16:30:39 +00007427static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007428unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7429
Tim Peters6d6c1a32001-08-02 04:15:00 +00007430static PyObject *
7431unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7432{
7433 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007434 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007435 char *encoding = NULL;
7436 char *errors = NULL;
7437
Guido van Rossume023fe02001-08-30 03:12:59 +00007438 if (type != &PyUnicode_Type)
7439 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007440 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7441 kwlist, &x, &encoding, &errors))
7442 return NULL;
7443 if (x == NULL)
7444 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007445 if (encoding == NULL && errors == NULL)
7446 return PyObject_Unicode(x);
7447 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007448 return PyUnicode_FromEncodedObject(x, encoding, errors);
7449}
7450
Guido van Rossume023fe02001-08-30 03:12:59 +00007451static PyObject *
7452unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7453{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007454 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007455 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007456
7457 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7458 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7459 if (tmp == NULL)
7460 return NULL;
7461 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007462 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007463 if (pnew == NULL) {
7464 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007465 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007466 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007467 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7468 if (pnew->str == NULL) {
7469 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007470 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007471 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007472 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007473 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007474 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7475 pnew->length = n;
7476 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007477 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007478 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007479}
7480
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007481PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007482"unicode(string [, encoding[, errors]]) -> object\n\
7483\n\
7484Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007485encoding defaults to the current default string encoding.\n\
7486errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007487
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488PyTypeObject PyUnicode_Type = {
7489 PyObject_HEAD_INIT(&PyType_Type)
7490 0, /* ob_size */
7491 "unicode", /* tp_name */
7492 sizeof(PyUnicodeObject), /* tp_size */
7493 0, /* tp_itemsize */
7494 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007495 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007497 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498 0, /* tp_setattr */
7499 (cmpfunc) unicode_compare, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00007500 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007501 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007502 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007503 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504 (hashfunc) unicode_hash, /* tp_hash*/
7505 0, /* tp_call*/
7506 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007507 PyObject_GenericGetAttr, /* tp_getattro */
7508 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007509 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007510 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7511 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007512 unicode_doc, /* tp_doc */
7513 0, /* tp_traverse */
7514 0, /* tp_clear */
7515 0, /* tp_richcompare */
7516 0, /* tp_weaklistoffset */
7517 0, /* tp_iter */
7518 0, /* tp_iternext */
7519 unicode_methods, /* tp_methods */
7520 0, /* tp_members */
7521 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007522 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007523 0, /* tp_dict */
7524 0, /* tp_descr_get */
7525 0, /* tp_descr_set */
7526 0, /* tp_dictoffset */
7527 0, /* tp_init */
7528 0, /* tp_alloc */
7529 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007530 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007531};
7532
7533/* Initialize the Unicode implementation */
7534
Thomas Wouters78890102000-07-22 19:25:51 +00007535void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007537 int i;
7538
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007539 /* XXX - move this array to unicodectype.c ? */
7540 Py_UNICODE linebreak[] = {
7541 0x000A, /* LINE FEED */
7542 0x000D, /* CARRIAGE RETURN */
7543 0x001C, /* FILE SEPARATOR */
7544 0x001D, /* GROUP SEPARATOR */
7545 0x001E, /* RECORD SEPARATOR */
7546 0x0085, /* NEXT LINE */
7547 0x2028, /* LINE SEPARATOR */
7548 0x2029, /* PARAGRAPH SEPARATOR */
7549 };
7550
Fred Drakee4315f52000-05-09 19:53:39 +00007551 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007552 unicode_freelist = NULL;
7553 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007554 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007555 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007556 for (i = 0; i < 256; i++)
7557 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007558 if (PyType_Ready(&PyUnicode_Type) < 0)
7559 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007560
7561 /* initialize the linebreak bloom filter */
7562 bloom_linebreak = make_bloom_mask(
7563 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
7564 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565}
7566
7567/* Finalize the Unicode implementation */
7568
7569void
Thomas Wouters78890102000-07-22 19:25:51 +00007570_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007572 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007573 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007575 Py_XDECREF(unicode_empty);
7576 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007577
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007578 for (i = 0; i < 256; i++) {
7579 if (unicode_latin1[i]) {
7580 Py_DECREF(unicode_latin1[i]);
7581 unicode_latin1[i] = NULL;
7582 }
7583 }
7584
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007585 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586 PyUnicodeObject *v = u;
7587 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007588 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007589 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007590 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007591 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007593 unicode_freelist = NULL;
7594 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007596
Anthony Baxterac6bd462006-04-13 02:06:09 +00007597#ifdef __cplusplus
7598}
7599#endif
7600
7601
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007602/*
7603Local variables:
7604c-basic-offset: 4
7605indent-tabs-mode: nil
7606End:
7607*/