blob: aff14f593aa12f033a2ab7c7c34c844393ede6f7 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
Martin v. Löwis5cb69362006-04-14 09:08:42 +000039#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000040#include "Python.h"
41
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Fredrik Lundhb63588c2006-05-23 18:44:25 +000049#undef USE_INLINE /* XXX - set via configure? */
50
51#if defined(_MSC_VER) /* this is taken from _sre.c */
52#pragma warning(disable: 4710)
53/* fastest possible local call under MSVC */
54#define LOCAL(type) static __inline type __fastcall
55#elif defined(USE_INLINE)
56#define LOCAL(type) static inline type
57#else
58#define LOCAL(type) static type
59#endif
60
Guido van Rossumd57fd912000-03-10 22:53:23 +000061/* Limit for the Unicode object free list */
62
63#define MAX_UNICODE_FREELIST_SIZE 1024
64
65/* Limit for the Unicode object free list stay alive optimization.
66
67 The implementation will keep allocated Unicode memory intact for
68 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000069 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
Barry Warsaw51ac5802000-03-20 16:36:48 +000071 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000072 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000073 malloc()-overhead) bytes of unused garbage.
74
75 Setting the limit to 0 effectively turns the feature off.
76
Guido van Rossumfd4b9572000-04-10 13:51:10 +000077 Note: This is an experimental feature ! If you get core dumps when
78 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000079
80*/
81
Guido van Rossumfd4b9572000-04-10 13:51:10 +000082#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000083
84/* Endianness switches; defaults to little endian */
85
86#ifdef WORDS_BIGENDIAN
87# define BYTEORDER_IS_BIG_ENDIAN
88#else
89# define BYTEORDER_IS_LITTLE_ENDIAN
90#endif
91
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000092/* --- Globals ------------------------------------------------------------
93
94 The globals are initialized by the _PyUnicode_Init() API and should
95 not be used before calling that API.
96
97*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Anthony Baxterac6bd462006-04-13 02:06:09 +000099
100#ifdef __cplusplus
101extern "C" {
102#endif
103
Guido van Rossumd57fd912000-03-10 22:53:23 +0000104/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000105static PyUnicodeObject *unicode_freelist;
106static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000107
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000108/* The empty Unicode object is shared to improve performance. */
109static PyUnicodeObject *unicode_empty;
110
111/* Single character Unicode strings in the Latin-1 range are being
112 shared as well. */
113static PyUnicodeObject *unicode_latin1[256];
114
Fred Drakee4315f52000-05-09 19:53:39 +0000115/* Default encoding to use and assume when NULL is passed as encoding
116 parameter; it is initialized by _PyUnicode_Init().
117
118 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000119 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000120
121*/
Fred Drakee4315f52000-05-09 19:53:39 +0000122static char unicode_default_encoding[100];
123
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000124Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000125PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000126{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000127#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000128 return 0x10FFFF;
129#else
130 /* This is actually an illegal character, so it should
131 not be passed to unichr. */
132 return 0xFFFF;
133#endif
134}
135
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000136/* --- Bloom Filters ----------------------------------------------------- */
137
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant 5 bits of each Unicode character as the bit index. */
141
142/* the linebreak mask is set up by Unicode_Init below */
143
144#define BLOOM_MASK unsigned long
145
146static BLOOM_MASK bloom_linebreak;
147
/* Test whether the bit selected by the low 5 bits of ch is set in mask.
   Evaluates to a non-zero value when the bit is set and to 0 otherwise.
   The shifted constant is an unsigned long so that bit 31 can be selected
   without shifting into the sign bit of a plain int, and mask is
   parenthesized so that any expression can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))
149
150#define BLOOM_LINEBREAK(ch)\
151 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
152
153LOCAL(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
154{
155 /* calculate simple bloom-style bitmask for a given unicode string */
156
157 long mask;
158 Py_ssize_t i;
159
160 mask = 0;
161 for (i = 0; i < len; i++)
162 mask |= (1 << (ptr[i] & 0x1F));
163
164 return mask;
165}
166
167LOCAL(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
168{
169 Py_ssize_t i;
170
171 for (i = 0; i < setlen; i++)
172 if (set[i] == chr)
173 return 1;
174
Fredrik Lundh77633512006-05-23 19:47:35 +0000175 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000176}
177
/* True if chr is a member of set: the cheap BLOOM() bit test runs first
   and the exact unicode_member() scan only runs when the bit is set.
   The whole expansion is parenthesized so the macro can be combined with
   other operators (e.g. negated) without precedence surprises. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
180
Guido van Rossumd57fd912000-03-10 22:53:23 +0000181/* --- Unicode Object ----------------------------------------------------- */
182
183static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000185 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000186{
187 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000188
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000189 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000191 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000192
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 /* Resizing shared object (unicode_empty or single character
194 objects) in-place is not allowed. Use PyUnicode_Resize()
195 instead ! */
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000196 if (unicode == unicode_empty ||
197 (unicode->length == 1 &&
198 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 return -1;
203 }
204
205 /* We allocate one more byte to make sure the string is
206 Ux0000 terminated -- XXX is this needed ? */
207 oldstr = unicode->str;
208 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
209 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000210 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211 PyErr_NoMemory();
212 return -1;
213 }
214 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000215 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000216
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000217 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000219 if (unicode->defenc) {
220 Py_DECREF(unicode->defenc);
221 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000222 }
223 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000224
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225 return 0;
226}
227
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
235
236static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000237PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000238{
239 register PyUnicodeObject *unicode;
240
Tim Petersced69f82003-09-16 20:30:58 +0000241 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 if (length == 0 && unicode_empty != NULL) {
243 Py_INCREF(unicode_empty);
244 return unicode_empty;
245 }
246
247 /* Unicode freelist & memory allocation */
248 if (unicode_freelist) {
249 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000250 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 /* Keep-Alive optimization: we only upsize the buffer,
254 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000255 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000256 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000257 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000258 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 }
260 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000261 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000263 }
264 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000265 }
266 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000267 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268 if (unicode == NULL)
269 return NULL;
270 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
271 }
272
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000273 if (!unicode->str) {
274 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000275 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000276 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000277 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000278 * the caller fails before initializing str -- unicode_resize()
279 * reads str[0], and the Keep-Alive optimization can keep memory
280 * allocated for str alive across a call to unicode_dealloc(unicode).
281 * We don't want unicode_resize to read uninitialized memory in
282 * that case.
283 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000284 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000285 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000286 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000288 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000290
291 onError:
292 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000293 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000294 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000295}
296
297static
Guido van Rossum9475a232001-10-05 20:51:39 +0000298void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000300 if (PyUnicode_CheckExact(unicode) &&
301 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000302 /* Keep-Alive optimization */
303 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000304 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305 unicode->str = NULL;
306 unicode->length = 0;
307 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000308 if (unicode->defenc) {
309 Py_DECREF(unicode->defenc);
310 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000311 }
312 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000313 *(PyUnicodeObject **)unicode = unicode_freelist;
314 unicode_freelist = unicode;
315 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316 }
317 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000318 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000319 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000320 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000321 }
322}
323
Martin v. Löwis18e16552006-02-15 17:27:45 +0000324int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000325{
326 register PyUnicodeObject *v;
327
328 /* Argument checks */
329 if (unicode == NULL) {
330 PyErr_BadInternalCall();
331 return -1;
332 }
333 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000334 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000335 PyErr_BadInternalCall();
336 return -1;
337 }
338
339 /* Resizing unicode_empty and single character objects is not
340 possible since these are being shared. We simply return a fresh
341 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000342 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000343 (v == unicode_empty || v->length == 1)) {
344 PyUnicodeObject *w = _PyUnicode_New(length);
345 if (w == NULL)
346 return -1;
347 Py_UNICODE_COPY(w->str, v->str,
348 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000349 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000350 *unicode = (PyObject *)w;
351 return 0;
352 }
353
354 /* Note that we don't have to modify *unicode for unshared Unicode
355 objects, since we can modify them in-place. */
356 return unicode_resize(v, length);
357}
358
359/* Internal API for use in unicodeobject.c only ! */
360#define _PyUnicode_Resize(unicodevar, length) \
361 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
362
Guido van Rossumd57fd912000-03-10 22:53:23 +0000363PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000364 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365{
366 PyUnicodeObject *unicode;
367
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000368 /* If the Unicode data is known at construction time, we can apply
369 some optimizations which share commonly used objects. */
370 if (u != NULL) {
371
372 /* Optimization for empty strings */
373 if (size == 0 && unicode_empty != NULL) {
374 Py_INCREF(unicode_empty);
375 return (PyObject *)unicode_empty;
376 }
377
378 /* Single character Unicode objects in the Latin-1 range are
379 shared when using this constructor */
380 if (size == 1 && *u < 256) {
381 unicode = unicode_latin1[*u];
382 if (!unicode) {
383 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000384 if (!unicode)
385 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000386 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000387 unicode_latin1[*u] = unicode;
388 }
389 Py_INCREF(unicode);
390 return (PyObject *)unicode;
391 }
392 }
Tim Petersced69f82003-09-16 20:30:58 +0000393
Guido van Rossumd57fd912000-03-10 22:53:23 +0000394 unicode = _PyUnicode_New(size);
395 if (!unicode)
396 return NULL;
397
398 /* Copy the Unicode data into the new object */
399 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000400 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401
402 return (PyObject *)unicode;
403}
404
405#ifdef HAVE_WCHAR_H
406
407PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000408 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000409{
410 PyUnicodeObject *unicode;
411
412 if (w == NULL) {
413 PyErr_BadInternalCall();
414 return NULL;
415 }
416
417 unicode = _PyUnicode_New(size);
418 if (!unicode)
419 return NULL;
420
421 /* Copy the wchar_t data into the new object */
422#ifdef HAVE_USABLE_WCHAR_T
423 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000424#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425 {
426 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000427 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000428 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000429 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430 *u++ = *w++;
431 }
432#endif
433
434 return (PyObject *)unicode;
435}
436
Martin v. Löwis18e16552006-02-15 17:27:45 +0000437Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
438 wchar_t *w,
439 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000440{
441 if (unicode == NULL) {
442 PyErr_BadInternalCall();
443 return -1;
444 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000445
446 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000447 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000448 size = PyUnicode_GET_SIZE(unicode) + 1;
449
Guido van Rossumd57fd912000-03-10 22:53:23 +0000450#ifdef HAVE_USABLE_WCHAR_T
451 memcpy(w, unicode->str, size * sizeof(wchar_t));
452#else
453 {
454 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000455 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000456 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000457 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000458 *w++ = *u++;
459 }
460#endif
461
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000462 if (size > PyUnicode_GET_SIZE(unicode))
463 return PyUnicode_GET_SIZE(unicode);
464 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000465 return size;
466}
467
468#endif
469
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000470PyObject *PyUnicode_FromOrdinal(int ordinal)
471{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000472 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000473
474#ifdef Py_UNICODE_WIDE
475 if (ordinal < 0 || ordinal > 0x10ffff) {
476 PyErr_SetString(PyExc_ValueError,
477 "unichr() arg not in range(0x110000) "
478 "(wide Python build)");
479 return NULL;
480 }
481#else
482 if (ordinal < 0 || ordinal > 0xffff) {
483 PyErr_SetString(PyExc_ValueError,
484 "unichr() arg not in range(0x10000) "
485 "(narrow Python build)");
486 return NULL;
487 }
488#endif
489
Hye-Shik Chang40574832004-04-06 07:24:51 +0000490 s[0] = (Py_UNICODE)ordinal;
491 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000492}
493
Guido van Rossumd57fd912000-03-10 22:53:23 +0000494PyObject *PyUnicode_FromObject(register PyObject *obj)
495{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000496 /* XXX Perhaps we should make this API an alias of
497 PyObject_Unicode() instead ?! */
498 if (PyUnicode_CheckExact(obj)) {
499 Py_INCREF(obj);
500 return obj;
501 }
502 if (PyUnicode_Check(obj)) {
503 /* For a Unicode subtype that's not a Unicode object,
504 return a true Unicode object with the same data. */
505 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
506 PyUnicode_GET_SIZE(obj));
507 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
509}
510
511PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
512 const char *encoding,
513 const char *errors)
514{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000515 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000516 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000517 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000518
Guido van Rossumd57fd912000-03-10 22:53:23 +0000519 if (obj == NULL) {
520 PyErr_BadInternalCall();
521 return NULL;
522 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000523
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000524#if 0
525 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000526 that no encodings is given and then redirect to
527 PyObject_Unicode() which then applies the additional logic for
528 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000529
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000530 NOTE: This API should really only be used for object which
531 represent *encoded* Unicode !
532
533 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000534 if (PyUnicode_Check(obj)) {
535 if (encoding) {
536 PyErr_SetString(PyExc_TypeError,
537 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000538 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000539 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000540 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000541 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000542#else
543 if (PyUnicode_Check(obj)) {
544 PyErr_SetString(PyExc_TypeError,
545 "decoding Unicode is not supported");
546 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000547 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000548#endif
549
550 /* Coerce object */
551 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000552 s = PyString_AS_STRING(obj);
553 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000554 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000555 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
556 /* Overwrite the error message with something more useful in
557 case of a TypeError. */
558 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000559 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000560 "coercing to Unicode: need string or buffer, "
561 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000562 obj->ob_type->tp_name);
563 goto onError;
564 }
Tim Petersced69f82003-09-16 20:30:58 +0000565
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000566 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000567 if (len == 0) {
568 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000569 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000570 }
Tim Petersced69f82003-09-16 20:30:58 +0000571 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000572 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000573
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000574 return v;
575
576 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000577 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000578}
579
580PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000581 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000582 const char *encoding,
583 const char *errors)
584{
585 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000586
587 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000588 encoding = PyUnicode_GetDefaultEncoding();
589
590 /* Shortcuts for common default encodings */
591 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000592 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000593 else if (strcmp(encoding, "latin-1") == 0)
594 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000595#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
596 else if (strcmp(encoding, "mbcs") == 0)
597 return PyUnicode_DecodeMBCS(s, size, errors);
598#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000599 else if (strcmp(encoding, "ascii") == 0)
600 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000601
602 /* Decode via the codec registry */
603 buffer = PyBuffer_FromMemory((void *)s, size);
604 if (buffer == NULL)
605 goto onError;
606 unicode = PyCodec_Decode(buffer, encoding, errors);
607 if (unicode == NULL)
608 goto onError;
609 if (!PyUnicode_Check(unicode)) {
610 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000611 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000612 unicode->ob_type->tp_name);
613 Py_DECREF(unicode);
614 goto onError;
615 }
616 Py_DECREF(buffer);
617 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000618
Guido van Rossumd57fd912000-03-10 22:53:23 +0000619 onError:
620 Py_XDECREF(buffer);
621 return NULL;
622}
623
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000624PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
625 const char *encoding,
626 const char *errors)
627{
628 PyObject *v;
629
630 if (!PyUnicode_Check(unicode)) {
631 PyErr_BadArgument();
632 goto onError;
633 }
634
635 if (encoding == NULL)
636 encoding = PyUnicode_GetDefaultEncoding();
637
638 /* Decode via the codec registry */
639 v = PyCodec_Decode(unicode, encoding, errors);
640 if (v == NULL)
641 goto onError;
642 return v;
643
644 onError:
645 return NULL;
646}
647
Guido van Rossumd57fd912000-03-10 22:53:23 +0000648PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000649 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 const char *encoding,
651 const char *errors)
652{
653 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000654
Guido van Rossumd57fd912000-03-10 22:53:23 +0000655 unicode = PyUnicode_FromUnicode(s, size);
656 if (unicode == NULL)
657 return NULL;
658 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
659 Py_DECREF(unicode);
660 return v;
661}
662
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000663PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
664 const char *encoding,
665 const char *errors)
666{
667 PyObject *v;
668
669 if (!PyUnicode_Check(unicode)) {
670 PyErr_BadArgument();
671 goto onError;
672 }
673
674 if (encoding == NULL)
675 encoding = PyUnicode_GetDefaultEncoding();
676
677 /* Encode via the codec registry */
678 v = PyCodec_Encode(unicode, encoding, errors);
679 if (v == NULL)
680 goto onError;
681 return v;
682
683 onError:
684 return NULL;
685}
686
Guido van Rossumd57fd912000-03-10 22:53:23 +0000687PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
688 const char *encoding,
689 const char *errors)
690{
691 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000692
Guido van Rossumd57fd912000-03-10 22:53:23 +0000693 if (!PyUnicode_Check(unicode)) {
694 PyErr_BadArgument();
695 goto onError;
696 }
Fred Drakee4315f52000-05-09 19:53:39 +0000697
Tim Petersced69f82003-09-16 20:30:58 +0000698 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000699 encoding = PyUnicode_GetDefaultEncoding();
700
701 /* Shortcuts for common default encodings */
702 if (errors == NULL) {
703 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000704 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000705 else if (strcmp(encoding, "latin-1") == 0)
706 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000707#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
708 else if (strcmp(encoding, "mbcs") == 0)
709 return PyUnicode_AsMBCSString(unicode);
710#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000711 else if (strcmp(encoding, "ascii") == 0)
712 return PyUnicode_AsASCIIString(unicode);
713 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000714
715 /* Encode via the codec registry */
716 v = PyCodec_Encode(unicode, encoding, errors);
717 if (v == NULL)
718 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000719 if (!PyString_Check(v)) {
720 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000721 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000722 v->ob_type->tp_name);
723 Py_DECREF(v);
724 goto onError;
725 }
726 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000727
Guido van Rossumd57fd912000-03-10 22:53:23 +0000728 onError:
729 return NULL;
730}
731
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000732PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
733 const char *errors)
734{
735 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
736
737 if (v)
738 return v;
739 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
740 if (v && errors == NULL)
741 ((PyUnicodeObject *)unicode)->defenc = v;
742 return v;
743}
744
Guido van Rossumd57fd912000-03-10 22:53:23 +0000745Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
746{
747 if (!PyUnicode_Check(unicode)) {
748 PyErr_BadArgument();
749 goto onError;
750 }
751 return PyUnicode_AS_UNICODE(unicode);
752
753 onError:
754 return NULL;
755}
756
Martin v. Löwis18e16552006-02-15 17:27:45 +0000757Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000758{
759 if (!PyUnicode_Check(unicode)) {
760 PyErr_BadArgument();
761 goto onError;
762 }
763 return PyUnicode_GET_SIZE(unicode);
764
765 onError:
766 return -1;
767}
768
Thomas Wouters78890102000-07-22 19:25:51 +0000769const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000770{
771 return unicode_default_encoding;
772}
773
774int PyUnicode_SetDefaultEncoding(const char *encoding)
775{
776 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000777
Fred Drakee4315f52000-05-09 19:53:39 +0000778 /* Make sure the encoding is valid. As side effect, this also
779 loads the encoding into the codec registry cache. */
780 v = _PyCodec_Lookup(encoding);
781 if (v == NULL)
782 goto onError;
783 Py_DECREF(v);
784 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000785 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000786 sizeof(unicode_default_encoding));
787 return 0;
788
789 onError:
790 return -1;
791}
792
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000793/* error handling callback helper:
794 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000795 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000796 and adjust various state variables.
797 return 0 on success, -1 on error
798*/
799
800static
801int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
802 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000803 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
804 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000805{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000806 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000807
808 PyObject *restuple = NULL;
809 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000810 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
811 Py_ssize_t requiredsize;
812 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000813 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000814 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000815 int res = -1;
816
817 if (*errorHandler == NULL) {
818 *errorHandler = PyCodec_LookupError(errors);
819 if (*errorHandler == NULL)
820 goto onError;
821 }
822
823 if (*exceptionObject == NULL) {
824 *exceptionObject = PyUnicodeDecodeError_Create(
825 encoding, input, insize, *startinpos, *endinpos, reason);
826 if (*exceptionObject == NULL)
827 goto onError;
828 }
829 else {
830 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
831 goto onError;
832 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
833 goto onError;
834 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
835 goto onError;
836 }
837
838 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
839 if (restuple == NULL)
840 goto onError;
841 if (!PyTuple_Check(restuple)) {
842 PyErr_Format(PyExc_TypeError, &argparse[4]);
843 goto onError;
844 }
845 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
846 goto onError;
847 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000848 newpos = insize+newpos;
849 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000850 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000851 goto onError;
852 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000853
854 /* need more space? (at least enough for what we
855 have+the replacement+the rest of the string (starting
856 at the new input position), so we won't have to check space
857 when there are no errors in the rest of the string) */
858 repptr = PyUnicode_AS_UNICODE(repunicode);
859 repsize = PyUnicode_GET_SIZE(repunicode);
860 requiredsize = *outpos + repsize + insize-newpos;
861 if (requiredsize > outsize) {
862 if (requiredsize<2*outsize)
863 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000864 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000865 goto onError;
866 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
867 }
868 *endinpos = newpos;
869 *inptr = input + newpos;
870 Py_UNICODE_COPY(*outptr, repptr, repsize);
871 *outptr += repsize;
872 *outpos += repsize;
873 /* we made it! */
874 res = 0;
875
876 onError:
877 Py_XDECREF(restuple);
878 return res;
879}
880
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000881/* --- UTF-7 Codec -------------------------------------------------------- */
882
883/* see RFC2152 for details */
884
/* Classification of the 128 ASCII code points for the UTF-7 codec.
   The table is only ever read, so it is const-qualified. */
static
const char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
903
/* Note: The comparison (c) <= 0 is a trick to work around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True when c cannot be written directly in UTF-7: anything outside
   1..127, anything marked 1 in utf7_special, and optionally whitespace
   (encodeWS) and RFC2152 Set O characters (encodeO). */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Map the low six bits of n to its base64 alphabet character. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True when c is a character of the base64 alphabet.  Explicit ASCII range
   tests are used instead of isalnum(): isalnum() depends on the current
   locale and is undefined for values that do not fit an unsigned char,
   while this macro is applied to Py_UNICODE values. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Inverse of B64: map a base64 alphabet character to its 6-bit value.
   Only meaningful when B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 characters to out for as long as at least six bits are
   pending in ch; bits is reduced by six for every character written. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* Emit 16-bit units to out for as long as at least sixteen bits are
   pending in ch.  Requires `errmsg` and a `utf7Error` label in the
   expanding function. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000946
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000947PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000948 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000949 const char *errors)
950{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000951 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000952 Py_ssize_t startinpos;
953 Py_ssize_t endinpos;
954 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000955 const char *e;
956 PyUnicodeObject *unicode;
957 Py_UNICODE *p;
958 const char *errmsg = "";
959 int inShift = 0;
960 unsigned int bitsleft = 0;
961 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000962 int surrogate = 0;
963 PyObject *errorHandler = NULL;
964 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000965
966 unicode = _PyUnicode_New(size);
967 if (!unicode)
968 return NULL;
969 if (size == 0)
970 return (PyObject *)unicode;
971
972 p = unicode->str;
973 e = s + size;
974
975 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000976 Py_UNICODE ch;
977 restart:
978 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000979
980 if (inShift) {
981 if ((ch == '-') || !B64CHAR(ch)) {
982 inShift = 0;
983 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000984
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000985 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
986 if (bitsleft >= 6) {
987 /* The shift sequence has a partial character in it. If
988 bitsleft < 6 then we could just classify it as padding
989 but that is not the case here */
990
991 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000992 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000993 }
994 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000995 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000996 here so indicate the potential of a misencoded character. */
997
998 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
999 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1000 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001001 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001002 }
1003
1004 if (ch == '-') {
1005 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001006 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001007 inShift = 1;
1008 }
1009 } else if (SPECIAL(ch,0,0)) {
1010 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001011 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001012 } else {
1013 *p++ = ch;
1014 }
1015 } else {
1016 charsleft = (charsleft << 6) | UB64(ch);
1017 bitsleft += 6;
1018 s++;
1019 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1020 }
1021 }
1022 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001023 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001024 s++;
1025 if (s < e && *s == '-') {
1026 s++;
1027 *p++ = '+';
1028 } else
1029 {
1030 inShift = 1;
1031 bitsleft = 0;
1032 }
1033 }
1034 else if (SPECIAL(ch,0,0)) {
1035 errmsg = "unexpected special character";
1036 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001037 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001038 }
1039 else {
1040 *p++ = ch;
1041 s++;
1042 }
1043 continue;
1044 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001045 outpos = p-PyUnicode_AS_UNICODE(unicode);
1046 endinpos = s-starts;
1047 if (unicode_decode_call_errorhandler(
1048 errors, &errorHandler,
1049 "utf7", errmsg,
1050 starts, size, &startinpos, &endinpos, &exc, &s,
1051 (PyObject **)&unicode, &outpos, &p))
1052 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001053 }
1054
1055 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001056 outpos = p-PyUnicode_AS_UNICODE(unicode);
1057 endinpos = size;
1058 if (unicode_decode_call_errorhandler(
1059 errors, &errorHandler,
1060 "utf7", "unterminated shift sequence",
1061 starts, size, &startinpos, &endinpos, &exc, &s,
1062 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001063 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001064 if (s < e)
1065 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001066 }
1067
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001068 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001069 goto onError;
1070
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001071 Py_XDECREF(errorHandler);
1072 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001073 return (PyObject *)unicode;
1074
1075onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001076 Py_XDECREF(errorHandler);
1077 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001078 Py_DECREF(unicode);
1079 return NULL;
1080}
1081
1082
1083PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001084 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001085 int encodeSetO,
1086 int encodeWhiteSpace,
1087 const char *errors)
1088{
1089 PyObject *v;
1090 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001091 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001092 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001093 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001094 unsigned int bitsleft = 0;
1095 unsigned long charsleft = 0;
1096 char * out;
1097 char * start;
1098
1099 if (size == 0)
1100 return PyString_FromStringAndSize(NULL, 0);
1101
1102 v = PyString_FromStringAndSize(NULL, cbAllocated);
1103 if (v == NULL)
1104 return NULL;
1105
1106 start = out = PyString_AS_STRING(v);
1107 for (;i < size; ++i) {
1108 Py_UNICODE ch = s[i];
1109
1110 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001111 if (ch == '+') {
1112 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001113 *out++ = '-';
1114 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1115 charsleft = ch;
1116 bitsleft = 16;
1117 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001118 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001119 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001120 } else {
1121 *out++ = (char) ch;
1122 }
1123 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001124 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1125 *out++ = B64(charsleft << (6-bitsleft));
1126 charsleft = 0;
1127 bitsleft = 0;
1128 /* Characters not in the BASE64 set implicitly unshift the sequence
1129 so no '-' is required, except if the character is itself a '-' */
1130 if (B64CHAR(ch) || ch == '-') {
1131 *out++ = '-';
1132 }
1133 inShift = 0;
1134 *out++ = (char) ch;
1135 } else {
1136 bitsleft += 16;
1137 charsleft = (charsleft << 16) | ch;
1138 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1139
1140 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001141 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001142 or '-' then the shift sequence will be terminated implicitly and we
1143 don't have to insert a '-'. */
1144
1145 if (bitsleft == 0) {
1146 if (i + 1 < size) {
1147 Py_UNICODE ch2 = s[i+1];
1148
1149 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001150
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001151 } else if (B64CHAR(ch2) || ch2 == '-') {
1152 *out++ = '-';
1153 inShift = 0;
1154 } else {
1155 inShift = 0;
1156 }
1157
1158 }
1159 else {
1160 *out++ = '-';
1161 inShift = 0;
1162 }
1163 }
Tim Petersced69f82003-09-16 20:30:58 +00001164 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001165 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001166 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001167 if (bitsleft) {
1168 *out++= B64(charsleft << (6-bitsleft) );
1169 *out++ = '-';
1170 }
1171
Tim Peters5de98422002-04-27 18:44:32 +00001172 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001173 return v;
1174}
1175
1176#undef SPECIAL
1177#undef B64
1178#undef B64CHAR
1179#undef UB64
1180#undef ENCODE
1181#undef DECODE
1182
Guido van Rossumd57fd912000-03-10 22:53:23 +00001183/* --- UTF-8 Codec -------------------------------------------------------- */
1184
/* Sequence length lookup for the UTF-8 decoder, indexed by the first byte
   of a sequence.  The table is only ever read, so it is const-qualified. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1206
Guido van Rossumd57fd912000-03-10 22:53:23 +00001207PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001208 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209 const char *errors)
1210{
Walter Dörwald69652032004-09-07 20:24:22 +00001211 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1212}
1213
1214PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001215 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001216 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001217 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001218{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001219 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001220 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001221 Py_ssize_t startinpos;
1222 Py_ssize_t endinpos;
1223 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224 const char *e;
1225 PyUnicodeObject *unicode;
1226 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001227 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001228 PyObject *errorHandler = NULL;
1229 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230
1231 /* Note: size will always be longer than the resulting Unicode
1232 character count */
1233 unicode = _PyUnicode_New(size);
1234 if (!unicode)
1235 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001236 if (size == 0) {
1237 if (consumed)
1238 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001239 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001240 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001241
1242 /* Unpack UTF-8 encoded data */
1243 p = unicode->str;
1244 e = s + size;
1245
1246 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001247 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248
1249 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001250 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 s++;
1252 continue;
1253 }
1254
1255 n = utf8_code_length[ch];
1256
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001257 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001258 if (consumed)
1259 break;
1260 else {
1261 errmsg = "unexpected end of data";
1262 startinpos = s-starts;
1263 endinpos = size;
1264 goto utf8Error;
1265 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001266 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001267
1268 switch (n) {
1269
1270 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001271 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001272 startinpos = s-starts;
1273 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001274 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275
1276 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001277 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001278 startinpos = s-starts;
1279 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001280 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001281
1282 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001283 if ((s[1] & 0xc0) != 0x80) {
1284 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001285 startinpos = s-starts;
1286 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001287 goto utf8Error;
1288 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001290 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001291 startinpos = s-starts;
1292 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001293 errmsg = "illegal encoding";
1294 goto utf8Error;
1295 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001296 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001297 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001298 break;
1299
1300 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001301 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001302 (s[2] & 0xc0) != 0x80) {
1303 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001304 startinpos = s-starts;
1305 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001306 goto utf8Error;
1307 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001308 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001309 if (ch < 0x0800) {
1310 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001311 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001312
1313 XXX For wide builds (UCS-4) we should probably try
1314 to recombine the surrogates into a single code
1315 unit.
1316 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001317 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001318 startinpos = s-starts;
1319 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001320 goto utf8Error;
1321 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001322 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001323 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001324 break;
1325
1326 case 4:
1327 if ((s[1] & 0xc0) != 0x80 ||
1328 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001329 (s[3] & 0xc0) != 0x80) {
1330 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001331 startinpos = s-starts;
1332 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001333 goto utf8Error;
1334 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001335 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1336 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1337 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001338 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001339 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001340 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001341 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001342 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001343 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001344 startinpos = s-starts;
1345 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001346 goto utf8Error;
1347 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001348#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001349 *p++ = (Py_UNICODE)ch;
1350#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001351 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001352
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001353 /* translate from 10000..10FFFF to 0..FFFF */
1354 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001355
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001356 /* high surrogate = top 10 bits added to D800 */
1357 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001358
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001359 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001360 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001361#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001362 break;
1363
1364 default:
1365 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001366 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001367 startinpos = s-starts;
1368 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001369 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001370 }
1371 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001372 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001373
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001374 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001375 outpos = p-PyUnicode_AS_UNICODE(unicode);
1376 if (unicode_decode_call_errorhandler(
1377 errors, &errorHandler,
1378 "utf8", errmsg,
1379 starts, size, &startinpos, &endinpos, &exc, &s,
1380 (PyObject **)&unicode, &outpos, &p))
1381 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382 }
Walter Dörwald69652032004-09-07 20:24:22 +00001383 if (consumed)
1384 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001385
1386 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001387 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001388 goto onError;
1389
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001390 Py_XDECREF(errorHandler);
1391 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 return (PyObject *)unicode;
1393
1394onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001395 Py_XDECREF(errorHandler);
1396 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001397 Py_DECREF(unicode);
1398 return NULL;
1399}
1400
Tim Peters602f7402002-04-27 18:03:26 +00001401/* Allocation strategy: if the string is short, convert into a stack buffer
1402 and allocate exactly as much space needed at the end. Else allocate the
1403 maximum possible needed (4 result bytes per Unicode character), and return
1404 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001405*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001406PyObject *
1407PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001408 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001409 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001410{
Tim Peters602f7402002-04-27 18:03:26 +00001411#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001412
Martin v. Löwis18e16552006-02-15 17:27:45 +00001413 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001414 PyObject *v; /* result string object */
1415 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001416 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001417 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001418 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001419
Tim Peters602f7402002-04-27 18:03:26 +00001420 assert(s != NULL);
1421 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001422
Tim Peters602f7402002-04-27 18:03:26 +00001423 if (size <= MAX_SHORT_UNICHARS) {
1424 /* Write into the stack buffer; nallocated can't overflow.
1425 * At the end, we'll allocate exactly as much heap space as it
1426 * turns out we need.
1427 */
1428 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1429 v = NULL; /* will allocate after we're done */
1430 p = stackbuf;
1431 }
1432 else {
1433 /* Overallocate on the heap, and give the excess back at the end. */
1434 nallocated = size * 4;
1435 if (nallocated / 4 != size) /* overflow! */
1436 return PyErr_NoMemory();
1437 v = PyString_FromStringAndSize(NULL, nallocated);
1438 if (v == NULL)
1439 return NULL;
1440 p = PyString_AS_STRING(v);
1441 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001442
Tim Peters602f7402002-04-27 18:03:26 +00001443 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001444 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001445
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001446 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001447 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001448 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001449
Guido van Rossumd57fd912000-03-10 22:53:23 +00001450 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001451 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001452 *p++ = (char)(0xc0 | (ch >> 6));
1453 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001454 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001455 else {
Tim Peters602f7402002-04-27 18:03:26 +00001456 /* Encode UCS2 Unicode ordinals */
1457 if (ch < 0x10000) {
1458 /* Special case: check for high surrogate */
1459 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1460 Py_UCS4 ch2 = s[i];
1461 /* Check for low surrogate and combine the two to
1462 form a UCS4 value */
1463 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001464 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001465 i++;
1466 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001467 }
Tim Peters602f7402002-04-27 18:03:26 +00001468 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001469 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001470 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001471 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1472 *p++ = (char)(0x80 | (ch & 0x3f));
1473 continue;
1474 }
1475encodeUCS4:
1476 /* Encode UCS4 Unicode ordinals */
1477 *p++ = (char)(0xf0 | (ch >> 18));
1478 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1479 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1480 *p++ = (char)(0x80 | (ch & 0x3f));
1481 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001482 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001483
Tim Peters602f7402002-04-27 18:03:26 +00001484 if (v == NULL) {
1485 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001486 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001487 assert(nneeded <= nallocated);
1488 v = PyString_FromStringAndSize(stackbuf, nneeded);
1489 }
1490 else {
1491 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001492 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001493 assert(nneeded <= nallocated);
1494 _PyString_Resize(&v, nneeded);
1495 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001497
Tim Peters602f7402002-04-27 18:03:26 +00001498#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001499}
1500
Guido van Rossumd57fd912000-03-10 22:53:23 +00001501PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1502{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001503 if (!PyUnicode_Check(unicode)) {
1504 PyErr_BadArgument();
1505 return NULL;
1506 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001507 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1508 PyUnicode_GET_SIZE(unicode),
1509 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001510}
1511
1512/* --- UTF-16 Codec ------------------------------------------------------- */
1513
Tim Peters772747b2001-08-09 22:21:55 +00001514PyObject *
1515PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001516 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001517 const char *errors,
1518 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001519{
Walter Dörwald69652032004-09-07 20:24:22 +00001520 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1521}
1522
1523PyObject *
1524PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001525 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001526 const char *errors,
1527 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001528 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001529{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001530 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001531 Py_ssize_t startinpos;
1532 Py_ssize_t endinpos;
1533 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001534 PyUnicodeObject *unicode;
1535 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001536 const unsigned char *q, *e;
1537 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001538 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001539 /* Offsets from q for retrieving byte pairs in the right order. */
1540#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1541 int ihi = 1, ilo = 0;
1542#else
1543 int ihi = 0, ilo = 1;
1544#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001545 PyObject *errorHandler = NULL;
1546 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001547
1548 /* Note: size will always be longer than the resulting Unicode
1549 character count */
1550 unicode = _PyUnicode_New(size);
1551 if (!unicode)
1552 return NULL;
1553 if (size == 0)
1554 return (PyObject *)unicode;
1555
1556 /* Unpack UTF-16 encoded data */
1557 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001558 q = (unsigned char *)s;
1559 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001560
1561 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001562 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001563
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001564 /* Check for BOM marks (U+FEFF) in the input and adjust current
1565 byte order setting accordingly. In native mode, the leading BOM
1566 mark is skipped, in all other modes, it is copied to the output
1567 stream as-is (giving a ZWNBSP character). */
1568 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001569 if (size >= 2) {
1570 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001571#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001572 if (bom == 0xFEFF) {
1573 q += 2;
1574 bo = -1;
1575 }
1576 else if (bom == 0xFFFE) {
1577 q += 2;
1578 bo = 1;
1579 }
Tim Petersced69f82003-09-16 20:30:58 +00001580#else
Walter Dörwald69652032004-09-07 20:24:22 +00001581 if (bom == 0xFEFF) {
1582 q += 2;
1583 bo = 1;
1584 }
1585 else if (bom == 0xFFFE) {
1586 q += 2;
1587 bo = -1;
1588 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001589#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001590 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001591 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001592
Tim Peters772747b2001-08-09 22:21:55 +00001593 if (bo == -1) {
1594 /* force LE */
1595 ihi = 1;
1596 ilo = 0;
1597 }
1598 else if (bo == 1) {
1599 /* force BE */
1600 ihi = 0;
1601 ilo = 1;
1602 }
1603
1604 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001605 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001606 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001607 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001608 if (consumed)
1609 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001610 errmsg = "truncated data";
1611 startinpos = ((const char *)q)-starts;
1612 endinpos = ((const char *)e)-starts;
1613 goto utf16Error;
1614 /* The remaining input chars are ignored if the callback
1615 chooses to skip the input */
1616 }
1617 ch = (q[ihi] << 8) | q[ilo];
1618
Tim Peters772747b2001-08-09 22:21:55 +00001619 q += 2;
1620
Guido van Rossumd57fd912000-03-10 22:53:23 +00001621 if (ch < 0xD800 || ch > 0xDFFF) {
1622 *p++ = ch;
1623 continue;
1624 }
1625
1626 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001627 if (q >= e) {
1628 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001629 startinpos = (((const char *)q)-2)-starts;
1630 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001631 goto utf16Error;
1632 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001633 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001634 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1635 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001636 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001637#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001638 *p++ = ch;
1639 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001640#else
1641 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001642#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001643 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001644 }
1645 else {
1646 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001647 startinpos = (((const char *)q)-4)-starts;
1648 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001649 goto utf16Error;
1650 }
1651
Guido van Rossumd57fd912000-03-10 22:53:23 +00001652 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001653 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001654 startinpos = (((const char *)q)-2)-starts;
1655 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001656 /* Fall through to report the error */
1657
1658 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001659 outpos = p-PyUnicode_AS_UNICODE(unicode);
1660 if (unicode_decode_call_errorhandler(
1661 errors, &errorHandler,
1662 "utf16", errmsg,
1663 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1664 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001665 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001666 }
1667
1668 if (byteorder)
1669 *byteorder = bo;
1670
Walter Dörwald69652032004-09-07 20:24:22 +00001671 if (consumed)
1672 *consumed = (const char *)q-starts;
1673
Guido van Rossumd57fd912000-03-10 22:53:23 +00001674 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001675 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001676 goto onError;
1677
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001678 Py_XDECREF(errorHandler);
1679 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001680 return (PyObject *)unicode;
1681
1682onError:
1683 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001684 Py_XDECREF(errorHandler);
1685 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001686 return NULL;
1687}
1688
Tim Peters772747b2001-08-09 22:21:55 +00001689PyObject *
1690PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001691 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001692 const char *errors,
1693 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001694{
1695 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001696 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001697#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001698 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001699#else
1700 const int pairs = 0;
1701#endif
Tim Peters772747b2001-08-09 22:21:55 +00001702 /* Offsets from p for storing byte pairs in the right order. */
1703#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1704 int ihi = 1, ilo = 0;
1705#else
1706 int ihi = 0, ilo = 1;
1707#endif
1708
1709#define STORECHAR(CH) \
1710 do { \
1711 p[ihi] = ((CH) >> 8) & 0xff; \
1712 p[ilo] = (CH) & 0xff; \
1713 p += 2; \
1714 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001715
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001716#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001717 for (i = pairs = 0; i < size; i++)
1718 if (s[i] >= 0x10000)
1719 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001720#endif
Tim Petersced69f82003-09-16 20:30:58 +00001721 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001722 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001723 if (v == NULL)
1724 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001725
Tim Peters772747b2001-08-09 22:21:55 +00001726 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001727 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001728 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001729 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001730 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001731
1732 if (byteorder == -1) {
1733 /* force LE */
1734 ihi = 1;
1735 ilo = 0;
1736 }
1737 else if (byteorder == 1) {
1738 /* force BE */
1739 ihi = 0;
1740 ilo = 1;
1741 }
1742
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001743 while (size-- > 0) {
1744 Py_UNICODE ch = *s++;
1745 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001746#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001747 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001748 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1749 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001750 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001751#endif
Tim Peters772747b2001-08-09 22:21:55 +00001752 STORECHAR(ch);
1753 if (ch2)
1754 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001755 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001756 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001757#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001758}
1759
1760PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1761{
1762 if (!PyUnicode_Check(unicode)) {
1763 PyErr_BadArgument();
1764 return NULL;
1765 }
1766 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1767 PyUnicode_GET_SIZE(unicode),
1768 NULL,
1769 0);
1770}
1771
1772/* --- Unicode Escape Codec ----------------------------------------------- */
1773
/* Name-lookup API used for \N{...} escapes.  Starts out NULL and is
   filled in by PyUnicode_DecodeUnicodeEscape() from the "ucnhash_CAPI"
   attribute of the unicodedata module the first time a \N escape is
   decoded. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001775
Guido van Rossumd57fd912000-03-10 22:53:23 +00001776PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001777 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001778 const char *errors)
1779{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001780 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001781 Py_ssize_t startinpos;
1782 Py_ssize_t endinpos;
1783 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001784 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001785 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001786 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001787 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001788 char* message;
1789 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001790 PyObject *errorHandler = NULL;
1791 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001792
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 /* Escaped strings will always be longer than the resulting
1794 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001795 length after conversion to the true value.
1796 (but if the error callback returns a long replacement string
1797 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798 v = _PyUnicode_New(size);
1799 if (v == NULL)
1800 goto onError;
1801 if (size == 0)
1802 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001803
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001804 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001805 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001806
Guido van Rossumd57fd912000-03-10 22:53:23 +00001807 while (s < end) {
1808 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001809 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001810 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001811
1812 /* Non-escape characters are interpreted as Unicode ordinals */
1813 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001814 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001815 continue;
1816 }
1817
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001818 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001819 /* \ - Escapes */
1820 s++;
1821 switch (*s++) {
1822
1823 /* \x escapes */
1824 case '\n': break;
1825 case '\\': *p++ = '\\'; break;
1826 case '\'': *p++ = '\''; break;
1827 case '\"': *p++ = '\"'; break;
1828 case 'b': *p++ = '\b'; break;
1829 case 'f': *p++ = '\014'; break; /* FF */
1830 case 't': *p++ = '\t'; break;
1831 case 'n': *p++ = '\n'; break;
1832 case 'r': *p++ = '\r'; break;
1833 case 'v': *p++ = '\013'; break; /* VT */
1834 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1835
1836 /* \OOO (octal) escapes */
1837 case '0': case '1': case '2': case '3':
1838 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001839 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001840 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001841 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001842 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001843 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001844 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001845 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 break;
1847
Fredrik Lundhccc74732001-02-18 22:13:49 +00001848 /* hex escapes */
1849 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001850 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001851 digits = 2;
1852 message = "truncated \\xXX escape";
1853 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854
Fredrik Lundhccc74732001-02-18 22:13:49 +00001855 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001857 digits = 4;
1858 message = "truncated \\uXXXX escape";
1859 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001860
Fredrik Lundhccc74732001-02-18 22:13:49 +00001861 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001862 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001863 digits = 8;
1864 message = "truncated \\UXXXXXXXX escape";
1865 hexescape:
1866 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001867 outpos = p-PyUnicode_AS_UNICODE(v);
1868 if (s+digits>end) {
1869 endinpos = size;
1870 if (unicode_decode_call_errorhandler(
1871 errors, &errorHandler,
1872 "unicodeescape", "end of string in escape sequence",
1873 starts, size, &startinpos, &endinpos, &exc, &s,
1874 (PyObject **)&v, &outpos, &p))
1875 goto onError;
1876 goto nextByte;
1877 }
1878 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001879 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001880 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001881 endinpos = (s+i+1)-starts;
1882 if (unicode_decode_call_errorhandler(
1883 errors, &errorHandler,
1884 "unicodeescape", message,
1885 starts, size, &startinpos, &endinpos, &exc, &s,
1886 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001887 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001888 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001889 }
1890 chr = (chr<<4) & ~0xF;
1891 if (c >= '0' && c <= '9')
1892 chr += c - '0';
1893 else if (c >= 'a' && c <= 'f')
1894 chr += 10 + c - 'a';
1895 else
1896 chr += 10 + c - 'A';
1897 }
1898 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001899 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001900 /* _decoding_error will have already written into the
1901 target buffer. */
1902 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001903 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001904 /* when we get here, chr is a 32-bit unicode character */
1905 if (chr <= 0xffff)
1906 /* UCS-2 character */
1907 *p++ = (Py_UNICODE) chr;
1908 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001909 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001910 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001911#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001912 *p++ = chr;
1913#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001914 chr -= 0x10000L;
1915 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001916 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001917#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001918 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001919 endinpos = s-starts;
1920 outpos = p-PyUnicode_AS_UNICODE(v);
1921 if (unicode_decode_call_errorhandler(
1922 errors, &errorHandler,
1923 "unicodeescape", "illegal Unicode character",
1924 starts, size, &startinpos, &endinpos, &exc, &s,
1925 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001926 goto onError;
1927 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001928 break;
1929
1930 /* \N{name} */
1931 case 'N':
1932 message = "malformed \\N character escape";
1933 if (ucnhash_CAPI == NULL) {
1934 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001935 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001936 m = PyImport_ImportModule("unicodedata");
1937 if (m == NULL)
1938 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001939 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001940 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001941 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001942 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00001943 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001944 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001945 if (ucnhash_CAPI == NULL)
1946 goto ucnhashError;
1947 }
1948 if (*s == '{') {
1949 const char *start = s+1;
1950 /* look for the closing brace */
1951 while (*s != '}' && s < end)
1952 s++;
1953 if (s > start && s < end && *s == '}') {
1954 /* found a name. look it up in the unicode database */
1955 message = "unknown Unicode character name";
1956 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001957 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001958 goto store;
1959 }
1960 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001961 endinpos = s-starts;
1962 outpos = p-PyUnicode_AS_UNICODE(v);
1963 if (unicode_decode_call_errorhandler(
1964 errors, &errorHandler,
1965 "unicodeescape", message,
1966 starts, size, &startinpos, &endinpos, &exc, &s,
1967 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001968 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001969 break;
1970
1971 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001972 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001973 message = "\\ at end of string";
1974 s--;
1975 endinpos = s-starts;
1976 outpos = p-PyUnicode_AS_UNICODE(v);
1977 if (unicode_decode_call_errorhandler(
1978 errors, &errorHandler,
1979 "unicodeescape", message,
1980 starts, size, &startinpos, &endinpos, &exc, &s,
1981 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001982 goto onError;
1983 }
1984 else {
1985 *p++ = '\\';
1986 *p++ = (unsigned char)s[-1];
1987 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001988 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001990 nextByte:
1991 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00001993 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001994 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001995 Py_XDECREF(errorHandler);
1996 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001998
Fredrik Lundhccc74732001-02-18 22:13:49 +00001999ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002000 PyErr_SetString(
2001 PyExc_UnicodeError,
2002 "\\N escapes not supported (can't load unicodedata module)"
2003 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002004 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002005 Py_XDECREF(errorHandler);
2006 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002007 return NULL;
2008
Fredrik Lundhccc74732001-02-18 22:13:49 +00002009onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002010 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002011 Py_XDECREF(errorHandler);
2012 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002013 return NULL;
2014}
2015
/* Return a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

   (This describes unicodeescape_string(), which is defined after the
   findchar() helper it uses.)

*/
2022
Fredrik Lundh347ee272006-05-24 16:35:18 +00002023LOCAL(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2024 Py_ssize_t size,
2025 Py_UNICODE ch)
2026{
2027 /* like wcschr, but doesn't stop at NULL characters */
2028
2029 while (size-- > 0) {
2030 if (*s == ch)
2031 return s;
2032 s++;
2033 }
2034
2035 return NULL;
2036}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002037
Guido van Rossumd57fd912000-03-10 22:53:23 +00002038static
2039PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002040 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041 int quotes)
2042{
2043 PyObject *repr;
2044 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002045
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002046 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047
2048 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
2049 if (repr == NULL)
2050 return NULL;
2051
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002052 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053
2054 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002055 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002056 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002057 !findchar(s, size, '"')) ? '"' : '\'';
2058 }
2059 while (size-- > 0) {
2060 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002061
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002062 /* Escape quotes and backslashes */
2063 if ((quotes &&
2064 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002065 *p++ = '\\';
2066 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002067 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002068 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002069
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002070#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002071 /* Map 21-bit characters to '\U00xxxxxx' */
2072 else if (ch >= 0x10000) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00002073 Py_ssize_t offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00002074
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002075 /* Resize the string if necessary */
2076 if (offset + 12 > PyString_GET_SIZE(repr)) {
2077 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00002078 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002079 p = PyString_AS_STRING(repr) + offset;
2080 }
2081
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002082 *p++ = '\\';
2083 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002084 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2085 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2086 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2087 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2088 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2089 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2090 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002091 *p++ = hexdigit[ch & 0x0000000F];
2092 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002093 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002094#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002095 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2096 else if (ch >= 0xD800 && ch < 0xDC00) {
2097 Py_UNICODE ch2;
2098 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002099
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002100 ch2 = *s++;
2101 size--;
2102 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2103 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2104 *p++ = '\\';
2105 *p++ = 'U';
2106 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2107 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2108 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2109 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2110 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2111 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2112 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2113 *p++ = hexdigit[ucs & 0x0000000F];
2114 continue;
2115 }
2116 /* Fall through: isolated surrogates are copied as-is */
2117 s--;
2118 size++;
2119 }
2120
Guido van Rossumd57fd912000-03-10 22:53:23 +00002121 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002122 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002123 *p++ = '\\';
2124 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002125 *p++ = hexdigit[(ch >> 12) & 0x000F];
2126 *p++ = hexdigit[(ch >> 8) & 0x000F];
2127 *p++ = hexdigit[(ch >> 4) & 0x000F];
2128 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002129 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002130
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002131 /* Map special whitespace to '\t', \n', '\r' */
2132 else if (ch == '\t') {
2133 *p++ = '\\';
2134 *p++ = 't';
2135 }
2136 else if (ch == '\n') {
2137 *p++ = '\\';
2138 *p++ = 'n';
2139 }
2140 else if (ch == '\r') {
2141 *p++ = '\\';
2142 *p++ = 'r';
2143 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002144
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002145 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002146 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002147 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002148 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002149 *p++ = hexdigit[(ch >> 4) & 0x000F];
2150 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002151 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002152
Guido van Rossumd57fd912000-03-10 22:53:23 +00002153 /* Copy everything else as-is */
2154 else
2155 *p++ = (char) ch;
2156 }
2157 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002158 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002159
2160 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002161 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002162 return repr;
2163}
2164
2165PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002166 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167{
2168 return unicodeescape_string(s, size, 0);
2169}
2170
2171PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2172{
2173 if (!PyUnicode_Check(unicode)) {
2174 PyErr_BadArgument();
2175 return NULL;
2176 }
2177 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2178 PyUnicode_GET_SIZE(unicode));
2179}
2180
2181/* --- Raw Unicode Escape Codec ------------------------------------------- */
2182
2183PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002184 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002185 const char *errors)
2186{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002187 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002188 Py_ssize_t startinpos;
2189 Py_ssize_t endinpos;
2190 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002191 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002192 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002193 const char *end;
2194 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002195 PyObject *errorHandler = NULL;
2196 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002197
Guido van Rossumd57fd912000-03-10 22:53:23 +00002198 /* Escaped strings will always be longer than the resulting
2199 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002200 length after conversion to the true value. (But decoding error
2201 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002202 v = _PyUnicode_New(size);
2203 if (v == NULL)
2204 goto onError;
2205 if (size == 0)
2206 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002207 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002208 end = s + size;
2209 while (s < end) {
2210 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002211 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002212 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002213 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002214
2215 /* Non-escape characters are interpreted as Unicode ordinals */
2216 if (*s != '\\') {
2217 *p++ = (unsigned char)*s++;
2218 continue;
2219 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002220 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002221
2222 /* \u-escapes are only interpreted iff the number of leading
2223 backslashes if odd */
2224 bs = s;
2225 for (;s < end;) {
2226 if (*s != '\\')
2227 break;
2228 *p++ = (unsigned char)*s++;
2229 }
2230 if (((s - bs) & 1) == 0 ||
2231 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002232 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002233 continue;
2234 }
2235 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002236 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002237 s++;
2238
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002239 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002240 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002241 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002242 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002243 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002244 endinpos = s-starts;
2245 if (unicode_decode_call_errorhandler(
2246 errors, &errorHandler,
2247 "rawunicodeescape", "truncated \\uXXXX",
2248 starts, size, &startinpos, &endinpos, &exc, &s,
2249 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002250 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002251 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002252 }
2253 x = (x<<4) & ~0xF;
2254 if (c >= '0' && c <= '9')
2255 x += c - '0';
2256 else if (c >= 'a' && c <= 'f')
2257 x += 10 + c - 'a';
2258 else
2259 x += 10 + c - 'A';
2260 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002261#ifndef Py_UNICODE_WIDE
2262 if (x > 0x10000) {
2263 if (unicode_decode_call_errorhandler(
2264 errors, &errorHandler,
2265 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2266 starts, size, &startinpos, &endinpos, &exc, &s,
2267 (PyObject **)&v, &outpos, &p))
2268 goto onError;
2269 }
2270#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002271 *p++ = x;
2272 nextByte:
2273 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002274 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002275 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002276 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002277 Py_XDECREF(errorHandler);
2278 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002280
Guido van Rossumd57fd912000-03-10 22:53:23 +00002281 onError:
2282 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002283 Py_XDECREF(errorHandler);
2284 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002285 return NULL;
2286}
2287
2288PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002289 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002290{
2291 PyObject *repr;
2292 char *p;
2293 char *q;
2294
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002295 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002296
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002297#ifdef Py_UNICODE_WIDE
2298 repr = PyString_FromStringAndSize(NULL, 10 * size);
2299#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002300 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002301#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002302 if (repr == NULL)
2303 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002304 if (size == 0)
2305 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002306
2307 p = q = PyString_AS_STRING(repr);
2308 while (size-- > 0) {
2309 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002310#ifdef Py_UNICODE_WIDE
2311 /* Map 32-bit characters to '\Uxxxxxxxx' */
2312 if (ch >= 0x10000) {
2313 *p++ = '\\';
2314 *p++ = 'U';
2315 *p++ = hexdigit[(ch >> 28) & 0xf];
2316 *p++ = hexdigit[(ch >> 24) & 0xf];
2317 *p++ = hexdigit[(ch >> 20) & 0xf];
2318 *p++ = hexdigit[(ch >> 16) & 0xf];
2319 *p++ = hexdigit[(ch >> 12) & 0xf];
2320 *p++ = hexdigit[(ch >> 8) & 0xf];
2321 *p++ = hexdigit[(ch >> 4) & 0xf];
2322 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002323 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002324 else
2325#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002326 /* Map 16-bit characters to '\uxxxx' */
2327 if (ch >= 256) {
2328 *p++ = '\\';
2329 *p++ = 'u';
2330 *p++ = hexdigit[(ch >> 12) & 0xf];
2331 *p++ = hexdigit[(ch >> 8) & 0xf];
2332 *p++ = hexdigit[(ch >> 4) & 0xf];
2333 *p++ = hexdigit[ch & 15];
2334 }
2335 /* Copy everything else as-is */
2336 else
2337 *p++ = (char) ch;
2338 }
2339 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002340 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002341 return repr;
2342}
2343
2344PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2345{
2346 if (!PyUnicode_Check(unicode)) {
2347 PyErr_BadArgument();
2348 return NULL;
2349 }
2350 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2351 PyUnicode_GET_SIZE(unicode));
2352}
2353
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002354/* --- Unicode Internal Codec ------------------------------------------- */
2355
2356PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002357 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002358 const char *errors)
2359{
2360 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002361 Py_ssize_t startinpos;
2362 Py_ssize_t endinpos;
2363 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002364 PyUnicodeObject *v;
2365 Py_UNICODE *p;
2366 const char *end;
2367 const char *reason;
2368 PyObject *errorHandler = NULL;
2369 PyObject *exc = NULL;
2370
Neal Norwitzd43069c2006-01-08 01:12:10 +00002371#ifdef Py_UNICODE_WIDE
2372 Py_UNICODE unimax = PyUnicode_GetMax();
2373#endif
2374
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002375 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2376 if (v == NULL)
2377 goto onError;
2378 if (PyUnicode_GetSize((PyObject *)v) == 0)
2379 return (PyObject *)v;
2380 p = PyUnicode_AS_UNICODE(v);
2381 end = s + size;
2382
2383 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00002384 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002385 /* We have to sanity check the raw data, otherwise doom looms for
2386 some malformed UCS-4 data. */
2387 if (
2388 #ifdef Py_UNICODE_WIDE
2389 *p > unimax || *p < 0 ||
2390 #endif
2391 end-s < Py_UNICODE_SIZE
2392 )
2393 {
2394 startinpos = s - starts;
2395 if (end-s < Py_UNICODE_SIZE) {
2396 endinpos = end-starts;
2397 reason = "truncated input";
2398 }
2399 else {
2400 endinpos = s - starts + Py_UNICODE_SIZE;
2401 reason = "illegal code point (> 0x10FFFF)";
2402 }
2403 outpos = p - PyUnicode_AS_UNICODE(v);
2404 if (unicode_decode_call_errorhandler(
2405 errors, &errorHandler,
2406 "unicode_internal", reason,
2407 starts, size, &startinpos, &endinpos, &exc, &s,
2408 (PyObject **)&v, &outpos, &p)) {
2409 goto onError;
2410 }
2411 }
2412 else {
2413 p++;
2414 s += Py_UNICODE_SIZE;
2415 }
2416 }
2417
Martin v. Löwis412fb672006-04-13 06:34:32 +00002418 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002419 goto onError;
2420 Py_XDECREF(errorHandler);
2421 Py_XDECREF(exc);
2422 return (PyObject *)v;
2423
2424 onError:
2425 Py_XDECREF(v);
2426 Py_XDECREF(errorHandler);
2427 Py_XDECREF(exc);
2428 return NULL;
2429}
2430
Guido van Rossumd57fd912000-03-10 22:53:23 +00002431/* --- Latin-1 Codec ------------------------------------------------------ */
2432
2433PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002434 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002435 const char *errors)
2436{
2437 PyUnicodeObject *v;
2438 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002439
Guido van Rossumd57fd912000-03-10 22:53:23 +00002440 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002441 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002442 Py_UNICODE r = *(unsigned char*)s;
2443 return PyUnicode_FromUnicode(&r, 1);
2444 }
2445
Guido van Rossumd57fd912000-03-10 22:53:23 +00002446 v = _PyUnicode_New(size);
2447 if (v == NULL)
2448 goto onError;
2449 if (size == 0)
2450 return (PyObject *)v;
2451 p = PyUnicode_AS_UNICODE(v);
2452 while (size-- > 0)
2453 *p++ = (unsigned char)*s++;
2454 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002455
Guido van Rossumd57fd912000-03-10 22:53:23 +00002456 onError:
2457 Py_XDECREF(v);
2458 return NULL;
2459}
2460
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002461/* create or adjust a UnicodeEncodeError */
2462static void make_encode_exception(PyObject **exceptionObject,
2463 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002464 const Py_UNICODE *unicode, Py_ssize_t size,
2465 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002466 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002467{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002468 if (*exceptionObject == NULL) {
2469 *exceptionObject = PyUnicodeEncodeError_Create(
2470 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002471 }
2472 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002473 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2474 goto onError;
2475 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2476 goto onError;
2477 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2478 goto onError;
2479 return;
2480 onError:
2481 Py_DECREF(*exceptionObject);
2482 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002483 }
2484}
2485
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002486/* raises a UnicodeEncodeError */
2487static void raise_encode_exception(PyObject **exceptionObject,
2488 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002489 const Py_UNICODE *unicode, Py_ssize_t size,
2490 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002491 const char *reason)
2492{
2493 make_encode_exception(exceptionObject,
2494 encoding, unicode, size, startpos, endpos, reason);
2495 if (*exceptionObject != NULL)
2496 PyCodec_StrictErrors(*exceptionObject);
2497}
2498
2499/* error handling callback helper:
2500 build arguments, call the callback and check the arguments,
2501 put the result into newpos and return the replacement string, which
2502 has to be freed by the caller */
2503static PyObject *unicode_encode_call_errorhandler(const char *errors,
2504 PyObject **errorHandler,
2505 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002506 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2507 Py_ssize_t startpos, Py_ssize_t endpos,
2508 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002509{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002510 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002511
2512 PyObject *restuple;
2513 PyObject *resunicode;
2514
2515 if (*errorHandler == NULL) {
2516 *errorHandler = PyCodec_LookupError(errors);
2517 if (*errorHandler == NULL)
2518 return NULL;
2519 }
2520
2521 make_encode_exception(exceptionObject,
2522 encoding, unicode, size, startpos, endpos, reason);
2523 if (*exceptionObject == NULL)
2524 return NULL;
2525
2526 restuple = PyObject_CallFunctionObjArgs(
2527 *errorHandler, *exceptionObject, NULL);
2528 if (restuple == NULL)
2529 return NULL;
2530 if (!PyTuple_Check(restuple)) {
2531 PyErr_Format(PyExc_TypeError, &argparse[4]);
2532 Py_DECREF(restuple);
2533 return NULL;
2534 }
2535 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2536 &resunicode, newpos)) {
2537 Py_DECREF(restuple);
2538 return NULL;
2539 }
2540 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002541 *newpos = size+*newpos;
2542 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002543 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002544 Py_DECREF(restuple);
2545 return NULL;
2546 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002547 Py_INCREF(resunicode);
2548 Py_DECREF(restuple);
2549 return resunicode;
2550}
2551
2552static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002553 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002554 const char *errors,
2555 int limit)
2556{
2557 /* output object */
2558 PyObject *res;
2559 /* pointers to the beginning and end+1 of input */
2560 const Py_UNICODE *startp = p;
2561 const Py_UNICODE *endp = p + size;
2562 /* pointer to the beginning of the unencodable characters */
2563 /* const Py_UNICODE *badp = NULL; */
2564 /* pointer into the output */
2565 char *str;
2566 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002567 Py_ssize_t respos = 0;
2568 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00002569 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2570 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002571 PyObject *errorHandler = NULL;
2572 PyObject *exc = NULL;
2573 /* the following variable is used for caching string comparisons
2574 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2575 int known_errorHandler = -1;
2576
2577 /* allocate enough for a simple encoding without
2578 replacements, if we need more, we'll resize */
2579 res = PyString_FromStringAndSize(NULL, size);
2580 if (res == NULL)
2581 goto onError;
2582 if (size == 0)
2583 return res;
2584 str = PyString_AS_STRING(res);
2585 ressize = size;
2586
2587 while (p<endp) {
2588 Py_UNICODE c = *p;
2589
2590 /* can we encode this? */
2591 if (c<limit) {
2592 /* no overflow check, because we know that the space is enough */
2593 *str++ = (char)c;
2594 ++p;
2595 }
2596 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002597 Py_ssize_t unicodepos = p-startp;
2598 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002599 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002600 Py_ssize_t repsize;
2601 Py_ssize_t newpos;
2602 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002603 Py_UNICODE *uni2;
2604 /* startpos for collecting unencodable chars */
2605 const Py_UNICODE *collstart = p;
2606 const Py_UNICODE *collend = p;
2607 /* find all unecodable characters */
2608 while ((collend < endp) && ((*collend)>=limit))
2609 ++collend;
2610 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2611 if (known_errorHandler==-1) {
2612 if ((errors==NULL) || (!strcmp(errors, "strict")))
2613 known_errorHandler = 1;
2614 else if (!strcmp(errors, "replace"))
2615 known_errorHandler = 2;
2616 else if (!strcmp(errors, "ignore"))
2617 known_errorHandler = 3;
2618 else if (!strcmp(errors, "xmlcharrefreplace"))
2619 known_errorHandler = 4;
2620 else
2621 known_errorHandler = 0;
2622 }
2623 switch (known_errorHandler) {
2624 case 1: /* strict */
2625 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2626 goto onError;
2627 case 2: /* replace */
2628 while (collstart++<collend)
2629 *str++ = '?'; /* fall through */
2630 case 3: /* ignore */
2631 p = collend;
2632 break;
2633 case 4: /* xmlcharrefreplace */
2634 respos = str-PyString_AS_STRING(res);
2635 /* determine replacement size (temporarily (mis)uses p) */
2636 for (p = collstart, repsize = 0; p < collend; ++p) {
2637 if (*p<10)
2638 repsize += 2+1+1;
2639 else if (*p<100)
2640 repsize += 2+2+1;
2641 else if (*p<1000)
2642 repsize += 2+3+1;
2643 else if (*p<10000)
2644 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002645#ifndef Py_UNICODE_WIDE
2646 else
2647 repsize += 2+5+1;
2648#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002649 else if (*p<100000)
2650 repsize += 2+5+1;
2651 else if (*p<1000000)
2652 repsize += 2+6+1;
2653 else
2654 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002655#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002656 }
2657 requiredsize = respos+repsize+(endp-collend);
2658 if (requiredsize > ressize) {
2659 if (requiredsize<2*ressize)
2660 requiredsize = 2*ressize;
2661 if (_PyString_Resize(&res, requiredsize))
2662 goto onError;
2663 str = PyString_AS_STRING(res) + respos;
2664 ressize = requiredsize;
2665 }
2666 /* generate replacement (temporarily (mis)uses p) */
2667 for (p = collstart; p < collend; ++p) {
2668 str += sprintf(str, "&#%d;", (int)*p);
2669 }
2670 p = collend;
2671 break;
2672 default:
2673 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2674 encoding, reason, startp, size, &exc,
2675 collstart-startp, collend-startp, &newpos);
2676 if (repunicode == NULL)
2677 goto onError;
2678 /* need more space? (at least enough for what we
2679 have+the replacement+the rest of the string, so
2680 we won't have to check space for encodable characters) */
2681 respos = str-PyString_AS_STRING(res);
2682 repsize = PyUnicode_GET_SIZE(repunicode);
2683 requiredsize = respos+repsize+(endp-collend);
2684 if (requiredsize > ressize) {
2685 if (requiredsize<2*ressize)
2686 requiredsize = 2*ressize;
2687 if (_PyString_Resize(&res, requiredsize)) {
2688 Py_DECREF(repunicode);
2689 goto onError;
2690 }
2691 str = PyString_AS_STRING(res) + respos;
2692 ressize = requiredsize;
2693 }
2694 /* check if there is anything unencodable in the replacement
2695 and copy it to the output */
2696 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2697 c = *uni2;
2698 if (c >= limit) {
2699 raise_encode_exception(&exc, encoding, startp, size,
2700 unicodepos, unicodepos+1, reason);
2701 Py_DECREF(repunicode);
2702 goto onError;
2703 }
2704 *str = (char)c;
2705 }
2706 p = startp + newpos;
2707 Py_DECREF(repunicode);
2708 }
2709 }
2710 }
2711 /* Resize if we allocated to much */
2712 respos = str-PyString_AS_STRING(res);
2713 if (respos<ressize)
2714 /* If this falls res will be NULL */
2715 _PyString_Resize(&res, respos);
2716 Py_XDECREF(errorHandler);
2717 Py_XDECREF(exc);
2718 return res;
2719
2720 onError:
2721 Py_XDECREF(res);
2722 Py_XDECREF(errorHandler);
2723 Py_XDECREF(exc);
2724 return NULL;
2725}
2726
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002728 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002729 const char *errors)
2730{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002731 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002732}
2733
2734PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2735{
2736 if (!PyUnicode_Check(unicode)) {
2737 PyErr_BadArgument();
2738 return NULL;
2739 }
2740 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2741 PyUnicode_GET_SIZE(unicode),
2742 NULL);
2743}
2744
2745/* --- 7-bit ASCII Codec -------------------------------------------------- */
2746
Guido van Rossumd57fd912000-03-10 22:53:23 +00002747PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002748 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002749 const char *errors)
2750{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002751 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002752 PyUnicodeObject *v;
2753 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002754 Py_ssize_t startinpos;
2755 Py_ssize_t endinpos;
2756 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002757 const char *e;
2758 PyObject *errorHandler = NULL;
2759 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002760
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002762 if (size == 1 && *(unsigned char*)s < 128) {
2763 Py_UNICODE r = *(unsigned char*)s;
2764 return PyUnicode_FromUnicode(&r, 1);
2765 }
Tim Petersced69f82003-09-16 20:30:58 +00002766
Guido van Rossumd57fd912000-03-10 22:53:23 +00002767 v = _PyUnicode_New(size);
2768 if (v == NULL)
2769 goto onError;
2770 if (size == 0)
2771 return (PyObject *)v;
2772 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002773 e = s + size;
2774 while (s < e) {
2775 register unsigned char c = (unsigned char)*s;
2776 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002777 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002778 ++s;
2779 }
2780 else {
2781 startinpos = s-starts;
2782 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002783 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002784 if (unicode_decode_call_errorhandler(
2785 errors, &errorHandler,
2786 "ascii", "ordinal not in range(128)",
2787 starts, size, &startinpos, &endinpos, &exc, &s,
2788 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002789 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002790 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002791 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002792 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00002793 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002794 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002795 Py_XDECREF(errorHandler);
2796 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002797 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002798
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799 onError:
2800 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002801 Py_XDECREF(errorHandler);
2802 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803 return NULL;
2804}
2805
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002807 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808 const char *errors)
2809{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002810 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811}
2812
2813PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2814{
2815 if (!PyUnicode_Check(unicode)) {
2816 PyErr_BadArgument();
2817 return NULL;
2818 }
2819 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2820 PyUnicode_GET_SIZE(unicode),
2821 NULL);
2822}
2823
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002824#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002825
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002826/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002827
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002828PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002829 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002830 const char *errors)
2831{
2832 PyUnicodeObject *v;
2833 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002834 DWORD usize;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002835
2836 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002837 assert(size < INT_MAX);
2838 usize = MultiByteToWideChar(CP_ACP, 0, s, (int)size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002839 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002840 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2841
2842 v = _PyUnicode_New(usize);
2843 if (v == NULL)
2844 return NULL;
2845 if (usize == 0)
2846 return (PyObject *)v;
2847 p = PyUnicode_AS_UNICODE(v);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002848 if (0 == MultiByteToWideChar(CP_ACP, 0, s, (int)size, p, usize)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002849 Py_DECREF(v);
2850 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2851 }
2852
2853 return (PyObject *)v;
2854}
2855
2856PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002857 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002858 const char *errors)
2859{
2860 PyObject *repr;
2861 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002862 DWORD mbcssize;
2863
2864 /* If there are no characters, bail now! */
2865 if (size==0)
2866 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002867
2868 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002869 assert(size<INT_MAX);
2870 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, (int)size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002871 if (mbcssize==0)
2872 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2873
2874 repr = PyString_FromStringAndSize(NULL, mbcssize);
2875 if (repr == NULL)
2876 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002877 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002878 return repr;
2879
2880 /* Do the conversion */
2881 s = PyString_AS_STRING(repr);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002882 assert(size < INT_MAX);
2883 if (0 == WideCharToMultiByte(CP_ACP, 0, p, (int)size, s, mbcssize, NULL, NULL)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002884 Py_DECREF(repr);
2885 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2886 }
2887 return repr;
2888}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002889
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002890PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2891{
2892 if (!PyUnicode_Check(unicode)) {
2893 PyErr_BadArgument();
2894 return NULL;
2895 }
2896 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2897 PyUnicode_GET_SIZE(unicode),
2898 NULL);
2899}
2900
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002901#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002902
Guido van Rossumd57fd912000-03-10 22:53:23 +00002903/* --- Character Mapping Codec -------------------------------------------- */
2904
Guido van Rossumd57fd912000-03-10 22:53:23 +00002905PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002906 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002907 PyObject *mapping,
2908 const char *errors)
2909{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002910 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002911 Py_ssize_t startinpos;
2912 Py_ssize_t endinpos;
2913 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002914 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002915 PyUnicodeObject *v;
2916 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002917 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002918 PyObject *errorHandler = NULL;
2919 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002920 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002921 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00002922
Guido van Rossumd57fd912000-03-10 22:53:23 +00002923 /* Default to Latin-1 */
2924 if (mapping == NULL)
2925 return PyUnicode_DecodeLatin1(s, size, errors);
2926
2927 v = _PyUnicode_New(size);
2928 if (v == NULL)
2929 goto onError;
2930 if (size == 0)
2931 return (PyObject *)v;
2932 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002933 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002934 if (PyUnicode_CheckExact(mapping)) {
2935 mapstring = PyUnicode_AS_UNICODE(mapping);
2936 maplen = PyUnicode_GET_SIZE(mapping);
2937 while (s < e) {
2938 unsigned char ch = *s;
2939 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002940
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002941 if (ch < maplen)
2942 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002943
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002944 if (x == 0xfffe) {
2945 /* undefined mapping */
2946 outpos = p-PyUnicode_AS_UNICODE(v);
2947 startinpos = s-starts;
2948 endinpos = startinpos+1;
2949 if (unicode_decode_call_errorhandler(
2950 errors, &errorHandler,
2951 "charmap", "character maps to <undefined>",
2952 starts, size, &startinpos, &endinpos, &exc, &s,
2953 (PyObject **)&v, &outpos, &p)) {
2954 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002955 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002956 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002957 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002958 *p++ = x;
2959 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002960 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002961 }
2962 else {
2963 while (s < e) {
2964 unsigned char ch = *s;
2965 PyObject *w, *x;
2966
2967 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2968 w = PyInt_FromLong((long)ch);
2969 if (w == NULL)
2970 goto onError;
2971 x = PyObject_GetItem(mapping, w);
2972 Py_DECREF(w);
2973 if (x == NULL) {
2974 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2975 /* No mapping found means: mapping is undefined. */
2976 PyErr_Clear();
2977 x = Py_None;
2978 Py_INCREF(x);
2979 } else
2980 goto onError;
2981 }
2982
2983 /* Apply mapping */
2984 if (PyInt_Check(x)) {
2985 long value = PyInt_AS_LONG(x);
2986 if (value < 0 || value > 65535) {
2987 PyErr_SetString(PyExc_TypeError,
2988 "character mapping must be in range(65536)");
2989 Py_DECREF(x);
2990 goto onError;
2991 }
2992 *p++ = (Py_UNICODE)value;
2993 }
2994 else if (x == Py_None) {
2995 /* undefined mapping */
2996 outpos = p-PyUnicode_AS_UNICODE(v);
2997 startinpos = s-starts;
2998 endinpos = startinpos+1;
2999 if (unicode_decode_call_errorhandler(
3000 errors, &errorHandler,
3001 "charmap", "character maps to <undefined>",
3002 starts, size, &startinpos, &endinpos, &exc, &s,
3003 (PyObject **)&v, &outpos, &p)) {
3004 Py_DECREF(x);
3005 goto onError;
3006 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003007 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003008 continue;
3009 }
3010 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003011 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003012
3013 if (targetsize == 1)
3014 /* 1-1 mapping */
3015 *p++ = *PyUnicode_AS_UNICODE(x);
3016
3017 else if (targetsize > 1) {
3018 /* 1-n mapping */
3019 if (targetsize > extrachars) {
3020 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003021 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3022 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003023 (targetsize << 2);
3024 extrachars += needed;
3025 if (_PyUnicode_Resize(&v,
3026 PyUnicode_GET_SIZE(v) + needed) < 0) {
3027 Py_DECREF(x);
3028 goto onError;
3029 }
3030 p = PyUnicode_AS_UNICODE(v) + oldpos;
3031 }
3032 Py_UNICODE_COPY(p,
3033 PyUnicode_AS_UNICODE(x),
3034 targetsize);
3035 p += targetsize;
3036 extrachars -= targetsize;
3037 }
3038 /* 1-0 mapping: skip the character */
3039 }
3040 else {
3041 /* wrong return value */
3042 PyErr_SetString(PyExc_TypeError,
3043 "character mapping must return integer, None or unicode");
3044 Py_DECREF(x);
3045 goto onError;
3046 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003047 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003048 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003049 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050 }
3051 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003052 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003054 Py_XDECREF(errorHandler);
3055 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003057
Guido van Rossumd57fd912000-03-10 22:53:23 +00003058 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003059 Py_XDECREF(errorHandler);
3060 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003061 Py_XDECREF(v);
3062 return NULL;
3063}
3064
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003065/* Lookup the character ch in the mapping. If the character
3066 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003067 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003068static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003069{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003070 PyObject *w = PyInt_FromLong((long)c);
3071 PyObject *x;
3072
3073 if (w == NULL)
3074 return NULL;
3075 x = PyObject_GetItem(mapping, w);
3076 Py_DECREF(w);
3077 if (x == NULL) {
3078 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3079 /* No mapping found means: mapping is undefined. */
3080 PyErr_Clear();
3081 x = Py_None;
3082 Py_INCREF(x);
3083 return x;
3084 } else
3085 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003086 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003087 else if (x == Py_None)
3088 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003089 else if (PyInt_Check(x)) {
3090 long value = PyInt_AS_LONG(x);
3091 if (value < 0 || value > 255) {
3092 PyErr_SetString(PyExc_TypeError,
3093 "character mapping must be in range(256)");
3094 Py_DECREF(x);
3095 return NULL;
3096 }
3097 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003098 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003099 else if (PyString_Check(x))
3100 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003101 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003102 /* wrong return value */
3103 PyErr_SetString(PyExc_TypeError,
3104 "character mapping must return integer, None or str");
3105 Py_DECREF(x);
3106 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107 }
3108}
3109
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003110/* lookup the character, put the result in the output string and adjust
3111 various state variables. Reallocate the output string if not enough
3112 space is available. Return a new reference to the object that
3113 was put in the output buffer, or Py_None, if the mapping was undefined
3114 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003115 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003116static
3117PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003118 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003119{
3120 PyObject *rep = charmapencode_lookup(c, mapping);
3121
3122 if (rep==NULL)
3123 return NULL;
3124 else if (rep==Py_None)
3125 return rep;
3126 else {
3127 char *outstart = PyString_AS_STRING(*outobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003128 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003129 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003130 Py_ssize_t requiredsize = *outpos+1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003131 if (outsize<requiredsize) {
3132 /* exponentially overallocate to minimize reallocations */
3133 if (requiredsize < 2*outsize)
3134 requiredsize = 2*outsize;
3135 if (_PyString_Resize(outobj, requiredsize)) {
3136 Py_DECREF(rep);
3137 return NULL;
3138 }
3139 outstart = PyString_AS_STRING(*outobj);
3140 }
3141 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3142 }
3143 else {
3144 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003145 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3146 Py_ssize_t requiredsize = *outpos+repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003147 if (outsize<requiredsize) {
3148 /* exponentially overallocate to minimize reallocations */
3149 if (requiredsize < 2*outsize)
3150 requiredsize = 2*outsize;
3151 if (_PyString_Resize(outobj, requiredsize)) {
3152 Py_DECREF(rep);
3153 return NULL;
3154 }
3155 outstart = PyString_AS_STRING(*outobj);
3156 }
3157 memcpy(outstart + *outpos, repchars, repsize);
3158 *outpos += repsize;
3159 }
3160 }
3161 return rep;
3162}
3163
3164/* handle an error in PyUnicode_EncodeCharmap
3165 Return 0 on success, -1 on error */
3166static
3167int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003168 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003169 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003170 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003171 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003172{
3173 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003174 Py_ssize_t repsize;
3175 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003176 Py_UNICODE *uni2;
3177 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003178 Py_ssize_t collstartpos = *inpos;
3179 Py_ssize_t collendpos = *inpos+1;
3180 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003181 char *encoding = "charmap";
3182 char *reason = "character maps to <undefined>";
3183
3184 PyObject *x;
3185 /* find all unencodable characters */
3186 while (collendpos < size) {
3187 x = charmapencode_lookup(p[collendpos], mapping);
3188 if (x==NULL)
3189 return -1;
3190 else if (x!=Py_None) {
3191 Py_DECREF(x);
3192 break;
3193 }
3194 Py_DECREF(x);
3195 ++collendpos;
3196 }
3197 /* cache callback name lookup
3198 * (if not done yet, i.e. it's the first error) */
3199 if (*known_errorHandler==-1) {
3200 if ((errors==NULL) || (!strcmp(errors, "strict")))
3201 *known_errorHandler = 1;
3202 else if (!strcmp(errors, "replace"))
3203 *known_errorHandler = 2;
3204 else if (!strcmp(errors, "ignore"))
3205 *known_errorHandler = 3;
3206 else if (!strcmp(errors, "xmlcharrefreplace"))
3207 *known_errorHandler = 4;
3208 else
3209 *known_errorHandler = 0;
3210 }
3211 switch (*known_errorHandler) {
3212 case 1: /* strict */
3213 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3214 return -1;
3215 case 2: /* replace */
3216 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3217 x = charmapencode_output('?', mapping, res, respos);
3218 if (x==NULL) {
3219 return -1;
3220 }
3221 else if (x==Py_None) {
3222 Py_DECREF(x);
3223 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3224 return -1;
3225 }
3226 Py_DECREF(x);
3227 }
3228 /* fall through */
3229 case 3: /* ignore */
3230 *inpos = collendpos;
3231 break;
3232 case 4: /* xmlcharrefreplace */
3233 /* generate replacement (temporarily (mis)uses p) */
3234 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3235 char buffer[2+29+1+1];
3236 char *cp;
3237 sprintf(buffer, "&#%d;", (int)p[collpos]);
3238 for (cp = buffer; *cp; ++cp) {
3239 x = charmapencode_output(*cp, mapping, res, respos);
3240 if (x==NULL)
3241 return -1;
3242 else if (x==Py_None) {
3243 Py_DECREF(x);
3244 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3245 return -1;
3246 }
3247 Py_DECREF(x);
3248 }
3249 }
3250 *inpos = collendpos;
3251 break;
3252 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003253 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003254 encoding, reason, p, size, exceptionObject,
3255 collstartpos, collendpos, &newpos);
3256 if (repunicode == NULL)
3257 return -1;
3258 /* generate replacement */
3259 repsize = PyUnicode_GET_SIZE(repunicode);
3260 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3261 x = charmapencode_output(*uni2, mapping, res, respos);
3262 if (x==NULL) {
3263 Py_DECREF(repunicode);
3264 return -1;
3265 }
3266 else if (x==Py_None) {
3267 Py_DECREF(repunicode);
3268 Py_DECREF(x);
3269 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3270 return -1;
3271 }
3272 Py_DECREF(x);
3273 }
3274 *inpos = newpos;
3275 Py_DECREF(repunicode);
3276 }
3277 return 0;
3278}
3279
Guido van Rossumd57fd912000-03-10 22:53:23 +00003280PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003281 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003282 PyObject *mapping,
3283 const char *errors)
3284{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003285 /* output object */
3286 PyObject *res = NULL;
3287 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003288 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003289 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003290 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003291 PyObject *errorHandler = NULL;
3292 PyObject *exc = NULL;
3293 /* the following variable is used for caching string comparisons
3294 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3295 * 3=ignore, 4=xmlcharrefreplace */
3296 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003297
3298 /* Default to Latin-1 */
3299 if (mapping == NULL)
3300 return PyUnicode_EncodeLatin1(p, size, errors);
3301
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003302 /* allocate enough for a simple encoding without
3303 replacements, if we need more, we'll resize */
3304 res = PyString_FromStringAndSize(NULL, size);
3305 if (res == NULL)
3306 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003307 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003308 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003310 while (inpos<size) {
3311 /* try to encode it */
3312 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3313 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003314 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003315 if (x==Py_None) { /* unencodable character */
3316 if (charmap_encoding_error(p, size, &inpos, mapping,
3317 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003318 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003319 &res, &respos)) {
3320 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003321 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003322 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003323 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003324 else
3325 /* done with this character => adjust input position */
3326 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003327 Py_DECREF(x);
3328 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003329
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003330 /* Resize if we allocated to much */
3331 if (respos<PyString_GET_SIZE(res)) {
3332 if (_PyString_Resize(&res, respos))
3333 goto onError;
3334 }
3335 Py_XDECREF(exc);
3336 Py_XDECREF(errorHandler);
3337 return res;
3338
3339 onError:
3340 Py_XDECREF(res);
3341 Py_XDECREF(exc);
3342 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003343 return NULL;
3344}
3345
3346PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3347 PyObject *mapping)
3348{
3349 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3350 PyErr_BadArgument();
3351 return NULL;
3352 }
3353 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3354 PyUnicode_GET_SIZE(unicode),
3355 mapping,
3356 NULL);
3357}
3358
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003359/* create or adjust a UnicodeTranslateError */
3360static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003361 const Py_UNICODE *unicode, Py_ssize_t size,
3362 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003363 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003364{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003365 if (*exceptionObject == NULL) {
3366 *exceptionObject = PyUnicodeTranslateError_Create(
3367 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003368 }
3369 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003370 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3371 goto onError;
3372 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3373 goto onError;
3374 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3375 goto onError;
3376 return;
3377 onError:
3378 Py_DECREF(*exceptionObject);
3379 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003380 }
3381}
3382
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003383/* raises a UnicodeTranslateError */
3384static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003385 const Py_UNICODE *unicode, Py_ssize_t size,
3386 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003387 const char *reason)
3388{
3389 make_translate_exception(exceptionObject,
3390 unicode, size, startpos, endpos, reason);
3391 if (*exceptionObject != NULL)
3392 PyCodec_StrictErrors(*exceptionObject);
3393}
3394
3395/* error handling callback helper:
3396 build arguments, call the callback and check the arguments,
3397 put the result into newpos and return the replacement string, which
3398 has to be freed by the caller */
3399static PyObject *unicode_translate_call_errorhandler(const char *errors,
3400 PyObject **errorHandler,
3401 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003402 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3403 Py_ssize_t startpos, Py_ssize_t endpos,
3404 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003405{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003406 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003407
Martin v. Löwis412fb672006-04-13 06:34:32 +00003408 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003409 PyObject *restuple;
3410 PyObject *resunicode;
3411
3412 if (*errorHandler == NULL) {
3413 *errorHandler = PyCodec_LookupError(errors);
3414 if (*errorHandler == NULL)
3415 return NULL;
3416 }
3417
3418 make_translate_exception(exceptionObject,
3419 unicode, size, startpos, endpos, reason);
3420 if (*exceptionObject == NULL)
3421 return NULL;
3422
3423 restuple = PyObject_CallFunctionObjArgs(
3424 *errorHandler, *exceptionObject, NULL);
3425 if (restuple == NULL)
3426 return NULL;
3427 if (!PyTuple_Check(restuple)) {
3428 PyErr_Format(PyExc_TypeError, &argparse[4]);
3429 Py_DECREF(restuple);
3430 return NULL;
3431 }
3432 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003433 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003434 Py_DECREF(restuple);
3435 return NULL;
3436 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003437 if (i_newpos<0)
3438 *newpos = size+i_newpos;
3439 else
3440 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003441 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003442 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003443 Py_DECREF(restuple);
3444 return NULL;
3445 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003446 Py_INCREF(resunicode);
3447 Py_DECREF(restuple);
3448 return resunicode;
3449}
3450
3451/* Lookup the character ch in the mapping and put the result in result,
3452 which must be decrefed by the caller.
3453 Return 0 on success, -1 on error */
3454static
3455int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3456{
3457 PyObject *w = PyInt_FromLong((long)c);
3458 PyObject *x;
3459
3460 if (w == NULL)
3461 return -1;
3462 x = PyObject_GetItem(mapping, w);
3463 Py_DECREF(w);
3464 if (x == NULL) {
3465 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3466 /* No mapping found means: use 1:1 mapping. */
3467 PyErr_Clear();
3468 *result = NULL;
3469 return 0;
3470 } else
3471 return -1;
3472 }
3473 else if (x == Py_None) {
3474 *result = x;
3475 return 0;
3476 }
3477 else if (PyInt_Check(x)) {
3478 long value = PyInt_AS_LONG(x);
3479 long max = PyUnicode_GetMax();
3480 if (value < 0 || value > max) {
3481 PyErr_Format(PyExc_TypeError,
3482 "character mapping must be in range(0x%lx)", max+1);
3483 Py_DECREF(x);
3484 return -1;
3485 }
3486 *result = x;
3487 return 0;
3488 }
3489 else if (PyUnicode_Check(x)) {
3490 *result = x;
3491 return 0;
3492 }
3493 else {
3494 /* wrong return value */
3495 PyErr_SetString(PyExc_TypeError,
3496 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003497 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003498 return -1;
3499 }
3500}
3501/* ensure that *outobj is at least requiredsize characters long,
3502if not reallocate and adjust various state variables.
3503Return 0 on success, -1 on error */
3504static
Walter Dörwald4894c302003-10-24 14:25:28 +00003505int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003506 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003507{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003508 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003509 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003510 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003511 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003512 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003513 if (requiredsize < 2 * oldsize)
3514 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003515 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003516 return -1;
3517 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003518 }
3519 return 0;
3520}
3521/* lookup the character, put the result in the output string and adjust
3522 various state variables. Return a new reference to the object that
3523 was put in the output buffer in *result, or Py_None, if the mapping was
3524 undefined (in which case no character was written).
3525 The called must decref result.
3526 Return 0 on success, -1 on error. */
3527static
Walter Dörwald4894c302003-10-24 14:25:28 +00003528int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003529 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003530 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003531{
Walter Dörwald4894c302003-10-24 14:25:28 +00003532 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003533 return -1;
3534 if (*res==NULL) {
3535 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003536 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003537 }
3538 else if (*res==Py_None)
3539 ;
3540 else if (PyInt_Check(*res)) {
3541 /* no overflow check, because we know that the space is enough */
3542 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3543 }
3544 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003545 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003546 if (repsize==1) {
3547 /* no overflow check, because we know that the space is enough */
3548 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3549 }
3550 else if (repsize!=0) {
3551 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003552 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003553 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003554 repsize - 1;
3555 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003556 return -1;
3557 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3558 *outp += repsize;
3559 }
3560 }
3561 else
3562 return -1;
3563 return 0;
3564}
3565
3566PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003567 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003568 PyObject *mapping,
3569 const char *errors)
3570{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003571 /* output object */
3572 PyObject *res = NULL;
3573 /* pointers to the beginning and end+1 of input */
3574 const Py_UNICODE *startp = p;
3575 const Py_UNICODE *endp = p + size;
3576 /* pointer into the output */
3577 Py_UNICODE *str;
3578 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003579 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003580 char *reason = "character maps to <undefined>";
3581 PyObject *errorHandler = NULL;
3582 PyObject *exc = NULL;
3583 /* the following variable is used for caching string comparisons
3584 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3585 * 3=ignore, 4=xmlcharrefreplace */
3586 int known_errorHandler = -1;
3587
Guido van Rossumd57fd912000-03-10 22:53:23 +00003588 if (mapping == NULL) {
3589 PyErr_BadArgument();
3590 return NULL;
3591 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003592
3593 /* allocate enough for a simple 1:1 translation without
3594 replacements, if we need more, we'll resize */
3595 res = PyUnicode_FromUnicode(NULL, size);
3596 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003597 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003598 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003599 return res;
3600 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003601
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003602 while (p<endp) {
3603 /* try to encode it */
3604 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003605 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003606 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003607 goto onError;
3608 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003609 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003610 if (x!=Py_None) /* it worked => adjust input pointer */
3611 ++p;
3612 else { /* untranslatable character */
3613 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003614 Py_ssize_t repsize;
3615 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003616 Py_UNICODE *uni2;
3617 /* startpos for collecting untranslatable chars */
3618 const Py_UNICODE *collstart = p;
3619 const Py_UNICODE *collend = p+1;
3620 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003621
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003622 /* find all untranslatable characters */
3623 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003624 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003625 goto onError;
3626 Py_XDECREF(x);
3627 if (x!=Py_None)
3628 break;
3629 ++collend;
3630 }
3631 /* cache callback name lookup
3632 * (if not done yet, i.e. it's the first error) */
3633 if (known_errorHandler==-1) {
3634 if ((errors==NULL) || (!strcmp(errors, "strict")))
3635 known_errorHandler = 1;
3636 else if (!strcmp(errors, "replace"))
3637 known_errorHandler = 2;
3638 else if (!strcmp(errors, "ignore"))
3639 known_errorHandler = 3;
3640 else if (!strcmp(errors, "xmlcharrefreplace"))
3641 known_errorHandler = 4;
3642 else
3643 known_errorHandler = 0;
3644 }
3645 switch (known_errorHandler) {
3646 case 1: /* strict */
3647 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3648 goto onError;
3649 case 2: /* replace */
3650 /* No need to check for space, this is a 1:1 replacement */
3651 for (coll = collstart; coll<collend; ++coll)
3652 *str++ = '?';
3653 /* fall through */
3654 case 3: /* ignore */
3655 p = collend;
3656 break;
3657 case 4: /* xmlcharrefreplace */
3658 /* generate replacement (temporarily (mis)uses p) */
3659 for (p = collstart; p < collend; ++p) {
3660 char buffer[2+29+1+1];
3661 char *cp;
3662 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003663 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003664 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3665 goto onError;
3666 for (cp = buffer; *cp; ++cp)
3667 *str++ = *cp;
3668 }
3669 p = collend;
3670 break;
3671 default:
3672 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3673 reason, startp, size, &exc,
3674 collstart-startp, collend-startp, &newpos);
3675 if (repunicode == NULL)
3676 goto onError;
3677 /* generate replacement */
3678 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003679 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003680 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3681 Py_DECREF(repunicode);
3682 goto onError;
3683 }
3684 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3685 *str++ = *uni2;
3686 p = startp + newpos;
3687 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003688 }
3689 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003690 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003691 /* Resize if we allocated to much */
3692 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003693 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003694 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003695 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003696 }
3697 Py_XDECREF(exc);
3698 Py_XDECREF(errorHandler);
3699 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003700
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003701 onError:
3702 Py_XDECREF(res);
3703 Py_XDECREF(exc);
3704 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003705 return NULL;
3706}
3707
3708PyObject *PyUnicode_Translate(PyObject *str,
3709 PyObject *mapping,
3710 const char *errors)
3711{
3712 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003713
Guido van Rossumd57fd912000-03-10 22:53:23 +00003714 str = PyUnicode_FromObject(str);
3715 if (str == NULL)
3716 goto onError;
3717 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3718 PyUnicode_GET_SIZE(str),
3719 mapping,
3720 errors);
3721 Py_DECREF(str);
3722 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003723
Guido van Rossumd57fd912000-03-10 22:53:23 +00003724 onError:
3725 Py_XDECREF(str);
3726 return NULL;
3727}
Tim Petersced69f82003-09-16 20:30:58 +00003728
Guido van Rossum9e896b32000-04-05 20:11:21 +00003729/* --- Decimal Encoder ---------------------------------------------------- */
3730
3731int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003732 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00003733 char *output,
3734 const char *errors)
3735{
3736 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003737 PyObject *errorHandler = NULL;
3738 PyObject *exc = NULL;
3739 const char *encoding = "decimal";
3740 const char *reason = "invalid decimal Unicode string";
3741 /* the following variable is used for caching string comparisons
3742 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3743 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003744
3745 if (output == NULL) {
3746 PyErr_BadArgument();
3747 return -1;
3748 }
3749
3750 p = s;
3751 end = s + length;
3752 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003753 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003754 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003755 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003756 Py_ssize_t repsize;
3757 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003758 Py_UNICODE *uni2;
3759 Py_UNICODE *collstart;
3760 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003761
Guido van Rossum9e896b32000-04-05 20:11:21 +00003762 if (Py_UNICODE_ISSPACE(ch)) {
3763 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003764 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003765 continue;
3766 }
3767 decimal = Py_UNICODE_TODECIMAL(ch);
3768 if (decimal >= 0) {
3769 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003770 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003771 continue;
3772 }
Guido van Rossumba477042000-04-06 18:18:10 +00003773 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003774 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003775 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003776 continue;
3777 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003778 /* All other characters are considered unencodable */
3779 collstart = p;
3780 collend = p+1;
3781 while (collend < end) {
3782 if ((0 < *collend && *collend < 256) ||
3783 !Py_UNICODE_ISSPACE(*collend) ||
3784 Py_UNICODE_TODECIMAL(*collend))
3785 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003786 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003787 /* cache callback name lookup
3788 * (if not done yet, i.e. it's the first error) */
3789 if (known_errorHandler==-1) {
3790 if ((errors==NULL) || (!strcmp(errors, "strict")))
3791 known_errorHandler = 1;
3792 else if (!strcmp(errors, "replace"))
3793 known_errorHandler = 2;
3794 else if (!strcmp(errors, "ignore"))
3795 known_errorHandler = 3;
3796 else if (!strcmp(errors, "xmlcharrefreplace"))
3797 known_errorHandler = 4;
3798 else
3799 known_errorHandler = 0;
3800 }
3801 switch (known_errorHandler) {
3802 case 1: /* strict */
3803 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3804 goto onError;
3805 case 2: /* replace */
3806 for (p = collstart; p < collend; ++p)
3807 *output++ = '?';
3808 /* fall through */
3809 case 3: /* ignore */
3810 p = collend;
3811 break;
3812 case 4: /* xmlcharrefreplace */
3813 /* generate replacement (temporarily (mis)uses p) */
3814 for (p = collstart; p < collend; ++p)
3815 output += sprintf(output, "&#%d;", (int)*p);
3816 p = collend;
3817 break;
3818 default:
3819 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3820 encoding, reason, s, length, &exc,
3821 collstart-s, collend-s, &newpos);
3822 if (repunicode == NULL)
3823 goto onError;
3824 /* generate replacement */
3825 repsize = PyUnicode_GET_SIZE(repunicode);
3826 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3827 Py_UNICODE ch = *uni2;
3828 if (Py_UNICODE_ISSPACE(ch))
3829 *output++ = ' ';
3830 else {
3831 decimal = Py_UNICODE_TODECIMAL(ch);
3832 if (decimal >= 0)
3833 *output++ = '0' + decimal;
3834 else if (0 < ch && ch < 256)
3835 *output++ = (char)ch;
3836 else {
3837 Py_DECREF(repunicode);
3838 raise_encode_exception(&exc, encoding,
3839 s, length, collstart-s, collend-s, reason);
3840 goto onError;
3841 }
3842 }
3843 }
3844 p = s + newpos;
3845 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003846 }
3847 }
3848 /* 0-terminate the output string */
3849 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003850 Py_XDECREF(exc);
3851 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003852 return 0;
3853
3854 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003855 Py_XDECREF(exc);
3856 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003857 return -1;
3858}
3859
Guido van Rossumd57fd912000-03-10 22:53:23 +00003860/* --- Helpers ------------------------------------------------------------ */
3861
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003862#define USE_FAST /* experimental fast search implementation */
3863
3864/* fast search/count implementation, based on a mix between boyer-
3865 moore and horspool, with a few more bells and whistles on the top.
3866 for some more background, see: http://effbot.org/stringlib */
3867
/* note: fastsearch may access s[n], which isn't a problem when using
   Python's ordinary string types, but may cause problems if you're
   using this code in other contexts.  also, the count mode returns -1
   if there cannot possibly be a match in the target string, and 0 if
   it has actually checked for matches, but didn't find any.  callers
   beware! */
Fredrik Lundhd5e0dc52006-05-24 15:11:01 +00003874
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003875#define FAST_COUNT 0
3876#define FAST_SEARCH 1
3877
Fredrik Lundhd5e0dc52006-05-24 15:11:01 +00003878LOCAL(Py_ssize_t)
3879fastsearch(Py_UNICODE* s, Py_ssize_t n, Py_UNICODE* p, Py_ssize_t m, int mode)
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003880{
3881 long mask;
3882 int skip, count = 0;
3883 Py_ssize_t i, j, mlast, w;
3884
3885 w = n - m;
3886
3887 if (w < 0)
3888 return -1;
3889
3890 /* look for special cases */
3891 if (m <= 1) {
Fredrik Lundhd5e0dc52006-05-24 15:11:01 +00003892 if (m <= 0)
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003893 return -1;
3894 /* use special case for 1-character strings */
3895 if (mode == FAST_COUNT) {
3896 for (i = 0; i < n; i++)
3897 if (s[i] == p[0])
3898 count++;
3899 return count;
3900 } else {
3901 for (i = 0; i < n; i++)
3902 if (s[i] == p[0])
3903 return i;
3904 }
3905 return -1;
3906 }
3907
3908 mlast = m - 1;
3909
3910 /* create compressed boyer-moore delta 1 table */
3911 skip = mlast - 1;
3912 /* process pattern[:-1] */
3913 for (mask = i = 0; i < mlast; i++) {
3914 mask |= (1 << (p[i] & 0x1F));
3915 if (p[i] == p[mlast])
3916 skip = mlast - i - 1;
3917 }
3918 /* process pattern[-1] outside the loop */
3919 mask |= (1 << (p[mlast] & 0x1F));
3920
3921 for (i = 0; i <= w; i++) {
3922 /* note: using mlast in the skip path slows things down on x86 */
3923 if (s[i+m-1] == p[m-1]) {
3924 /* candidate match */
3925 for (j = 0; j < mlast; j++)
3926 if (s[i+j] != p[j])
3927 break;
3928 if (j == mlast) {
3929 /* got a match! */
3930 if (mode != FAST_COUNT)
3931 return i;
3932 count++;
3933 i = i + mlast;
3934 continue;
3935 }
3936 /* miss: check if next character is part of pattern */
3937 if (!(mask & (1 << (s[i+m] & 0x1F))))
3938 i = i + m;
3939 else {
3940 i = i + skip;
3941 continue;
3942 }
3943 } else {
3944 /* skip: check if next character is part of pattern */
3945 if (!(mask & (1 << (s[i+m] & 0x1F))))
3946 i = i + m;
3947 }
3948 }
3949
3950 if (mode != FAST_COUNT)
3951 return -1;
3952 return count;
3953}
3954
3955LOCAL(Py_ssize_t) count(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003956 Py_ssize_t start,
3957 Py_ssize_t end,
3958 PyUnicodeObject *substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003959{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003960 Py_ssize_t count = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003961
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003962 if (start < 0)
3963 start += self->length;
3964 if (start < 0)
3965 start = 0;
3966 if (end > self->length)
3967 end = self->length;
3968 if (end < 0)
3969 end += self->length;
3970 if (end < 0)
3971 end = 0;
3972
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003973 if (substring->length == 0)
3974 return (end - start + 1);
3975
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003976#ifdef USE_FAST
3977 count = fastsearch(
3978 PyUnicode_AS_UNICODE(self) + start, end - start,
3979 substring->str, substring->length, FAST_COUNT
3980 );
3981 if (count < 0)
3982 count = 0; /* no match */
3983#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003984 end -= substring->length;
3985
3986 while (start <= end)
3987 if (Py_UNICODE_MATCH(self, start, substring)) {
3988 count++;
3989 start += substring->length;
3990 } else
3991 start++;
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003992#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003993
3994 return count;
3995}
3996
Martin v. Löwis18e16552006-02-15 17:27:45 +00003997Py_ssize_t PyUnicode_Count(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003998 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003999 Py_ssize_t start,
4000 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004001{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004002 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004003
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004 str = PyUnicode_FromObject(str);
4005 if (str == NULL)
4006 return -1;
4007 substr = PyUnicode_FromObject(substr);
4008 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00004009 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004010 return -1;
4011 }
Tim Petersced69f82003-09-16 20:30:58 +00004012
Guido van Rossumd57fd912000-03-10 22:53:23 +00004013 result = count((PyUnicodeObject *)str,
4014 start, end,
4015 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00004016
Guido van Rossumd57fd912000-03-10 22:53:23 +00004017 Py_DECREF(str);
4018 Py_DECREF(substr);
4019 return result;
4020}
4021
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004022static Py_ssize_t findstring(PyUnicodeObject *self,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004023 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004024 Py_ssize_t start,
4025 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004026 int direction)
4027{
4028 if (start < 0)
4029 start += self->length;
4030 if (start < 0)
4031 start = 0;
4032
Guido van Rossumd57fd912000-03-10 22:53:23 +00004033 if (end > self->length)
4034 end = self->length;
4035 if (end < 0)
4036 end += self->length;
4037 if (end < 0)
4038 end = 0;
4039
Guido van Rossum76afbd92002-08-20 17:29:29 +00004040 if (substring->length == 0)
4041 return (direction > 0) ? start : end;
4042
Fredrik Lundh6471ee42006-05-24 14:28:11 +00004043#ifdef USE_FAST
4044 if (direction > 0) {
4045 Py_ssize_t pos = fastsearch(
4046 PyUnicode_AS_UNICODE(self) + start, end - start,
4047 substring->str, substring->length, FAST_SEARCH
4048 );
4049 if (pos < 0)
4050 return pos;
4051 return pos + start;
4052 }
4053#endif
4054
Guido van Rossumd57fd912000-03-10 22:53:23 +00004055 end -= substring->length;
4056
4057 if (direction < 0) {
4058 for (; end >= start; end--)
4059 if (Py_UNICODE_MATCH(self, end, substring))
4060 return end;
4061 } else {
4062 for (; start <= end; start++)
4063 if (Py_UNICODE_MATCH(self, start, substring))
4064 return start;
4065 }
4066
4067 return -1;
4068}
4069
Martin v. Löwis18e16552006-02-15 17:27:45 +00004070Py_ssize_t PyUnicode_Find(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004071 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004072 Py_ssize_t start,
4073 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004074 int direction)
4075{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004076 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004077
Guido van Rossumd57fd912000-03-10 22:53:23 +00004078 str = PyUnicode_FromObject(str);
4079 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004080 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004081 substr = PyUnicode_FromObject(substr);
4082 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004083 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004084 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004085 }
Tim Petersced69f82003-09-16 20:30:58 +00004086
Guido van Rossumd57fd912000-03-10 22:53:23 +00004087 result = findstring((PyUnicodeObject *)str,
4088 (PyUnicodeObject *)substr,
4089 start, end, direction);
4090 Py_DECREF(str);
4091 Py_DECREF(substr);
4092 return result;
4093}
4094
Tim Petersced69f82003-09-16 20:30:58 +00004095static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004096int tailmatch(PyUnicodeObject *self,
4097 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004098 Py_ssize_t start,
4099 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004100 int direction)
4101{
4102 if (start < 0)
4103 start += self->length;
4104 if (start < 0)
4105 start = 0;
4106
4107 if (substring->length == 0)
4108 return 1;
4109
4110 if (end > self->length)
4111 end = self->length;
4112 if (end < 0)
4113 end += self->length;
4114 if (end < 0)
4115 end = 0;
4116
4117 end -= substring->length;
4118 if (end < start)
4119 return 0;
4120
4121 if (direction > 0) {
4122 if (Py_UNICODE_MATCH(self, end, substring))
4123 return 1;
4124 } else {
4125 if (Py_UNICODE_MATCH(self, start, substring))
4126 return 1;
4127 }
4128
4129 return 0;
4130}
4131
Martin v. Löwis18e16552006-02-15 17:27:45 +00004132Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004133 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004134 Py_ssize_t start,
4135 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004136 int direction)
4137{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004138 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004139
Guido van Rossumd57fd912000-03-10 22:53:23 +00004140 str = PyUnicode_FromObject(str);
4141 if (str == NULL)
4142 return -1;
4143 substr = PyUnicode_FromObject(substr);
4144 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004145 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004146 return -1;
4147 }
Tim Petersced69f82003-09-16 20:30:58 +00004148
Guido van Rossumd57fd912000-03-10 22:53:23 +00004149 result = tailmatch((PyUnicodeObject *)str,
4150 (PyUnicodeObject *)substr,
4151 start, end, direction);
4152 Py_DECREF(str);
4153 Py_DECREF(substr);
4154 return result;
4155}
4156
Guido van Rossumd57fd912000-03-10 22:53:23 +00004157/* Apply fixfct filter to the Unicode object self and return a
4158 reference to the modified object */
4159
Tim Petersced69f82003-09-16 20:30:58 +00004160static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004161PyObject *fixup(PyUnicodeObject *self,
4162 int (*fixfct)(PyUnicodeObject *s))
4163{
4164
4165 PyUnicodeObject *u;
4166
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004167 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004168 if (u == NULL)
4169 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004170
4171 Py_UNICODE_COPY(u->str, self->str, self->length);
4172
Tim Peters7a29bd52001-09-12 03:03:31 +00004173 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004174 /* fixfct should return TRUE if it modified the buffer. If
4175 FALSE, return a reference to the original buffer instead
4176 (to save space, not time) */
4177 Py_INCREF(self);
4178 Py_DECREF(u);
4179 return (PyObject*) self;
4180 }
4181 return (PyObject*) u;
4182}
4183
Tim Petersced69f82003-09-16 20:30:58 +00004184static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004185int fixupper(PyUnicodeObject *self)
4186{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004187 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004188 Py_UNICODE *s = self->str;
4189 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004190
Guido van Rossumd57fd912000-03-10 22:53:23 +00004191 while (len-- > 0) {
4192 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004193
Guido van Rossumd57fd912000-03-10 22:53:23 +00004194 ch = Py_UNICODE_TOUPPER(*s);
4195 if (ch != *s) {
4196 status = 1;
4197 *s = ch;
4198 }
4199 s++;
4200 }
4201
4202 return status;
4203}
4204
Tim Petersced69f82003-09-16 20:30:58 +00004205static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004206int fixlower(PyUnicodeObject *self)
4207{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004208 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004209 Py_UNICODE *s = self->str;
4210 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004211
Guido van Rossumd57fd912000-03-10 22:53:23 +00004212 while (len-- > 0) {
4213 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004214
Guido van Rossumd57fd912000-03-10 22:53:23 +00004215 ch = Py_UNICODE_TOLOWER(*s);
4216 if (ch != *s) {
4217 status = 1;
4218 *s = ch;
4219 }
4220 s++;
4221 }
4222
4223 return status;
4224}
4225
Tim Petersced69f82003-09-16 20:30:58 +00004226static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004227int fixswapcase(PyUnicodeObject *self)
4228{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004229 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004230 Py_UNICODE *s = self->str;
4231 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004232
Guido van Rossumd57fd912000-03-10 22:53:23 +00004233 while (len-- > 0) {
4234 if (Py_UNICODE_ISUPPER(*s)) {
4235 *s = Py_UNICODE_TOLOWER(*s);
4236 status = 1;
4237 } else if (Py_UNICODE_ISLOWER(*s)) {
4238 *s = Py_UNICODE_TOUPPER(*s);
4239 status = 1;
4240 }
4241 s++;
4242 }
4243
4244 return status;
4245}
4246
Tim Petersced69f82003-09-16 20:30:58 +00004247static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004248int fixcapitalize(PyUnicodeObject *self)
4249{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004250 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004251 Py_UNICODE *s = self->str;
4252 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004253
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004254 if (len == 0)
4255 return 0;
4256 if (Py_UNICODE_ISLOWER(*s)) {
4257 *s = Py_UNICODE_TOUPPER(*s);
4258 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004259 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004260 s++;
4261 while (--len > 0) {
4262 if (Py_UNICODE_ISUPPER(*s)) {
4263 *s = Py_UNICODE_TOLOWER(*s);
4264 status = 1;
4265 }
4266 s++;
4267 }
4268 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004269}
4270
4271static
4272int fixtitle(PyUnicodeObject *self)
4273{
4274 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4275 register Py_UNICODE *e;
4276 int previous_is_cased;
4277
4278 /* Shortcut for single character strings */
4279 if (PyUnicode_GET_SIZE(self) == 1) {
4280 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4281 if (*p != ch) {
4282 *p = ch;
4283 return 1;
4284 }
4285 else
4286 return 0;
4287 }
Tim Petersced69f82003-09-16 20:30:58 +00004288
Guido van Rossumd57fd912000-03-10 22:53:23 +00004289 e = p + PyUnicode_GET_SIZE(self);
4290 previous_is_cased = 0;
4291 for (; p < e; p++) {
4292 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004293
Guido van Rossumd57fd912000-03-10 22:53:23 +00004294 if (previous_is_cased)
4295 *p = Py_UNICODE_TOLOWER(ch);
4296 else
4297 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004298
4299 if (Py_UNICODE_ISLOWER(ch) ||
4300 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004301 Py_UNICODE_ISTITLE(ch))
4302 previous_is_cased = 1;
4303 else
4304 previous_is_cased = 0;
4305 }
4306 return 1;
4307}
4308
Tim Peters8ce9f162004-08-27 01:49:32 +00004309PyObject *
4310PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004311{
Tim Peters8ce9f162004-08-27 01:49:32 +00004312 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004313 const Py_UNICODE blank = ' ';
4314 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00004315 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004316 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00004317 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4318 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004319 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4320 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004321 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004322 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004323 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004324
Tim Peters05eba1f2004-08-27 21:32:02 +00004325 fseq = PySequence_Fast(seq, "");
4326 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004327 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004328 }
4329
Tim Peters91879ab2004-08-27 22:35:44 +00004330 /* Grrrr. A codec may be invoked to convert str objects to
4331 * Unicode, and so it's possible to call back into Python code
4332 * during PyUnicode_FromObject(), and so it's possible for a sick
4333 * codec to change the size of fseq (if seq is a list). Therefore
4334 * we have to keep refetching the size -- can't assume seqlen
4335 * is invariant.
4336 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004337 seqlen = PySequence_Fast_GET_SIZE(fseq);
4338 /* If empty sequence, return u"". */
4339 if (seqlen == 0) {
4340 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4341 goto Done;
4342 }
4343 /* If singleton sequence with an exact Unicode, return that. */
4344 if (seqlen == 1) {
4345 item = PySequence_Fast_GET_ITEM(fseq, 0);
4346 if (PyUnicode_CheckExact(item)) {
4347 Py_INCREF(item);
4348 res = (PyUnicodeObject *)item;
4349 goto Done;
4350 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004351 }
4352
Tim Peters05eba1f2004-08-27 21:32:02 +00004353 /* At least two items to join, or one that isn't exact Unicode. */
4354 if (seqlen > 1) {
4355 /* Set up sep and seplen -- they're needed. */
4356 if (separator == NULL) {
4357 sep = &blank;
4358 seplen = 1;
4359 }
4360 else {
4361 internal_separator = PyUnicode_FromObject(separator);
4362 if (internal_separator == NULL)
4363 goto onError;
4364 sep = PyUnicode_AS_UNICODE(internal_separator);
4365 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004366 /* In case PyUnicode_FromObject() mutated seq. */
4367 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004368 }
4369 }
4370
4371 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004372 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004373 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004374 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004375 res_p = PyUnicode_AS_UNICODE(res);
4376 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004377
Tim Peters05eba1f2004-08-27 21:32:02 +00004378 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00004379 Py_ssize_t itemlen;
4380 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004381
4382 item = PySequence_Fast_GET_ITEM(fseq, i);
4383 /* Convert item to Unicode. */
4384 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4385 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00004386 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004387 " %.80s found",
4388 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004389 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004390 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004391 item = PyUnicode_FromObject(item);
4392 if (item == NULL)
4393 goto onError;
4394 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004395
Tim Peters91879ab2004-08-27 22:35:44 +00004396 /* In case PyUnicode_FromObject() mutated seq. */
4397 seqlen = PySequence_Fast_GET_SIZE(fseq);
4398
Tim Peters8ce9f162004-08-27 01:49:32 +00004399 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004400 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004401 new_res_used = res_used + itemlen;
Tim Peters286085c2006-05-22 19:17:04 +00004402 if (new_res_used <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004403 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004404 if (i < seqlen - 1) {
4405 new_res_used += seplen;
Tim Peters286085c2006-05-22 19:17:04 +00004406 if (new_res_used <= 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004407 goto Overflow;
4408 }
4409 if (new_res_used > res_alloc) {
4410 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004411 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004412 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00004413 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004414 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004415 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00004416 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004417 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004418 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004419 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004420 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004421 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004422
4423 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004424 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004425 res_p += itemlen;
4426 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004427 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004428 res_p += seplen;
4429 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004430 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004431 res_used = new_res_used;
4432 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004433
Tim Peters05eba1f2004-08-27 21:32:02 +00004434 /* Shrink res to match the used area; this probably can't fail,
4435 * but it's cheap to check.
4436 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004437 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004438 goto onError;
4439
4440 Done:
4441 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004442 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004443 return (PyObject *)res;
4444
Tim Peters8ce9f162004-08-27 01:49:32 +00004445 Overflow:
4446 PyErr_SetString(PyExc_OverflowError,
4447 "join() is too long for a Python string");
4448 Py_DECREF(item);
4449 /* fall through */
4450
Guido van Rossumd57fd912000-03-10 22:53:23 +00004451 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004452 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004453 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004454 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004455 return NULL;
4456}
4457
Tim Petersced69f82003-09-16 20:30:58 +00004458static
4459PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004460 Py_ssize_t left,
4461 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004462 Py_UNICODE fill)
4463{
4464 PyUnicodeObject *u;
4465
4466 if (left < 0)
4467 left = 0;
4468 if (right < 0)
4469 right = 0;
4470
Tim Peters7a29bd52001-09-12 03:03:31 +00004471 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472 Py_INCREF(self);
4473 return self;
4474 }
4475
4476 u = _PyUnicode_New(left + self->length + right);
4477 if (u) {
4478 if (left)
4479 Py_UNICODE_FILL(u->str, fill, left);
4480 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4481 if (right)
4482 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4483 }
4484
4485 return u;
4486}
4487
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The enclosing function must declare `PyObject *str` and have a
   `list` variable and an `onError` label in scope; control jumps to
   onError if creating the slice or appending it fails.  The temporary
   reference stored in `str` is always released.  Note that `left` is
   evaluated twice.

   The body is wrapped in do { } while (0) so that the expansion is a
   single statement and is safe inside unbraced if/else bodies; invoke
   it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4498
4499static
4500PyObject *split_whitespace(PyUnicodeObject *self,
4501 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004502 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004503{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004504 register Py_ssize_t i;
4505 register Py_ssize_t j;
4506 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004507 PyObject *str;
4508
4509 for (i = j = 0; i < len; ) {
4510 /* find a token */
4511 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4512 i++;
4513 j = i;
4514 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4515 i++;
4516 if (j < i) {
4517 if (maxcount-- <= 0)
4518 break;
4519 SPLIT_APPEND(self->str, j, i);
4520 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4521 i++;
4522 j = i;
4523 }
4524 }
4525 if (j < len) {
4526 SPLIT_APPEND(self->str, j, len);
4527 }
4528 return list;
4529
4530 onError:
4531 Py_DECREF(list);
4532 return NULL;
4533}
4534
4535PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004536 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004537{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004538 register Py_ssize_t i;
4539 register Py_ssize_t j;
4540 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004541 PyObject *list;
4542 PyObject *str;
4543 Py_UNICODE *data;
4544
4545 string = PyUnicode_FromObject(string);
4546 if (string == NULL)
4547 return NULL;
4548 data = PyUnicode_AS_UNICODE(string);
4549 len = PyUnicode_GET_SIZE(string);
4550
Guido van Rossumd57fd912000-03-10 22:53:23 +00004551 list = PyList_New(0);
4552 if (!list)
4553 goto onError;
4554
4555 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004556 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004557
Guido van Rossumd57fd912000-03-10 22:53:23 +00004558 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004559 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004560 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004561
4562 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004563 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004564 if (i < len) {
4565 if (data[i] == '\r' && i + 1 < len &&
4566 data[i+1] == '\n')
4567 i += 2;
4568 else
4569 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004570 if (keepends)
4571 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004572 }
Guido van Rossum86662912000-04-11 15:38:46 +00004573 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004574 j = i;
4575 }
4576 if (j < len) {
4577 SPLIT_APPEND(data, j, len);
4578 }
4579
4580 Py_DECREF(string);
4581 return list;
4582
4583 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004584 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004585 Py_DECREF(string);
4586 return NULL;
4587}
4588
Tim Petersced69f82003-09-16 20:30:58 +00004589static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004590PyObject *split_char(PyUnicodeObject *self,
4591 PyObject *list,
4592 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004593 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004594{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004595 register Py_ssize_t i;
4596 register Py_ssize_t j;
4597 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004598 PyObject *str;
4599
4600 for (i = j = 0; i < len; ) {
4601 if (self->str[i] == ch) {
4602 if (maxcount-- <= 0)
4603 break;
4604 SPLIT_APPEND(self->str, j, i);
4605 i = j = i + 1;
4606 } else
4607 i++;
4608 }
4609 if (j <= len) {
4610 SPLIT_APPEND(self->str, j, len);
4611 }
4612 return list;
4613
4614 onError:
4615 Py_DECREF(list);
4616 return NULL;
4617}
4618
Tim Petersced69f82003-09-16 20:30:58 +00004619static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004620PyObject *split_substring(PyUnicodeObject *self,
4621 PyObject *list,
4622 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004623 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004624{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004625 register Py_ssize_t i;
4626 register Py_ssize_t j;
4627 Py_ssize_t len = self->length;
4628 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004629 PyObject *str;
4630
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004631 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004632 if (Py_UNICODE_MATCH(self, i, substring)) {
4633 if (maxcount-- <= 0)
4634 break;
4635 SPLIT_APPEND(self->str, j, i);
4636 i = j = i + sublen;
4637 } else
4638 i++;
4639 }
4640 if (j <= len) {
4641 SPLIT_APPEND(self->str, j, len);
4642 }
4643 return list;
4644
4645 onError:
4646 Py_DECREF(list);
4647 return NULL;
4648}
4649
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004650static
4651PyObject *rsplit_whitespace(PyUnicodeObject *self,
4652 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004653 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004654{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004655 register Py_ssize_t i;
4656 register Py_ssize_t j;
4657 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004658 PyObject *str;
4659
4660 for (i = j = len - 1; i >= 0; ) {
4661 /* find a token */
4662 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4663 i--;
4664 j = i;
4665 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4666 i--;
4667 if (j > i) {
4668 if (maxcount-- <= 0)
4669 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004670 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004671 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4672 i--;
4673 j = i;
4674 }
4675 }
4676 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004677 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004678 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004679 if (PyList_Reverse(list) < 0)
4680 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004681 return list;
4682
4683 onError:
4684 Py_DECREF(list);
4685 return NULL;
4686}
4687
4688static
4689PyObject *rsplit_char(PyUnicodeObject *self,
4690 PyObject *list,
4691 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004692 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004693{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004694 register Py_ssize_t i;
4695 register Py_ssize_t j;
4696 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004697 PyObject *str;
4698
4699 for (i = j = len - 1; i >= 0; ) {
4700 if (self->str[i] == ch) {
4701 if (maxcount-- <= 0)
4702 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004703 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004704 j = i = i - 1;
4705 } else
4706 i--;
4707 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004708 if (j >= -1) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004709 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004710 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004711 if (PyList_Reverse(list) < 0)
4712 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004713 return list;
4714
4715 onError:
4716 Py_DECREF(list);
4717 return NULL;
4718}
4719
4720static
4721PyObject *rsplit_substring(PyUnicodeObject *self,
4722 PyObject *list,
4723 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004724 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004725{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004726 register Py_ssize_t i;
4727 register Py_ssize_t j;
4728 Py_ssize_t len = self->length;
4729 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004730 PyObject *str;
4731
4732 for (i = len - sublen, j = len; i >= 0; ) {
4733 if (Py_UNICODE_MATCH(self, i, substring)) {
4734 if (maxcount-- <= 0)
4735 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004736 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004737 j = i;
4738 i -= sublen;
4739 } else
4740 i--;
4741 }
4742 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004743 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004744 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004745 if (PyList_Reverse(list) < 0)
4746 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004747 return list;
4748
4749 onError:
4750 Py_DECREF(list);
4751 return NULL;
4752}
4753
Guido van Rossumd57fd912000-03-10 22:53:23 +00004754#undef SPLIT_APPEND
4755
4756static
4757PyObject *split(PyUnicodeObject *self,
4758 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004759 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004760{
4761 PyObject *list;
4762
4763 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004764 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765
4766 list = PyList_New(0);
4767 if (!list)
4768 return NULL;
4769
4770 if (substring == NULL)
4771 return split_whitespace(self,list,maxcount);
4772
4773 else if (substring->length == 1)
4774 return split_char(self,list,substring->str[0],maxcount);
4775
4776 else if (substring->length == 0) {
4777 Py_DECREF(list);
4778 PyErr_SetString(PyExc_ValueError, "empty separator");
4779 return NULL;
4780 }
4781 else
4782 return split_substring(self,list,substring,maxcount);
4783}
4784
Tim Petersced69f82003-09-16 20:30:58 +00004785static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004786PyObject *rsplit(PyUnicodeObject *self,
4787 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004788 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004789{
4790 PyObject *list;
4791
4792 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004793 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004794
4795 list = PyList_New(0);
4796 if (!list)
4797 return NULL;
4798
4799 if (substring == NULL)
4800 return rsplit_whitespace(self,list,maxcount);
4801
4802 else if (substring->length == 1)
4803 return rsplit_char(self,list,substring->str[0],maxcount);
4804
4805 else if (substring->length == 0) {
4806 Py_DECREF(list);
4807 PyErr_SetString(PyExc_ValueError, "empty separator");
4808 return NULL;
4809 }
4810 else
4811 return rsplit_substring(self,list,substring,maxcount);
4812}
4813
4814static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004815PyObject *replace(PyUnicodeObject *self,
4816 PyUnicodeObject *str1,
4817 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004818 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004819{
4820 PyUnicodeObject *u;
4821
4822 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004823 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004824
Fredrik Lundh347ee272006-05-24 16:35:18 +00004825 if (str1->length == str2->length) {
4826 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004827 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00004828 if (str1->length == 1) {
4829 /* replace characters */
4830 Py_UNICODE u1, u2;
4831 if (!findchar(self->str, self->length, str1->str[0]))
4832 goto nothing;
4833 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
4834 if (!u)
4835 return NULL;
4836 Py_UNICODE_COPY(u->str, self->str, self->length);
4837 u1 = str1->str[0];
4838 u2 = str2->str[0];
4839 for (i = 0; i < u->length; i++)
4840 if (u->str[i] == u1) {
4841 if (--maxcount < 0)
4842 break;
4843 u->str[i] = u2;
4844 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00004846 i = fastsearch(
4847 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00004849 if (i < 0)
4850 goto nothing;
4851 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
4852 if (!u)
4853 return NULL;
4854 Py_UNICODE_COPY(u->str, self->str, self->length);
4855 while (i <= self->length - str1->length)
4856 if (Py_UNICODE_MATCH(self, i, str1)) {
4857 if (--maxcount < 0)
4858 break;
4859 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
4860 i += str1->length;
4861 } else
4862 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004863 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00004865
Martin v. Löwis18e16552006-02-15 17:27:45 +00004866 Py_ssize_t n, i;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00004867 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868 Py_UNICODE *p;
4869
4870 /* replace strings */
4871 n = count(self, 0, self->length, str1);
4872 if (n > maxcount)
4873 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00004874 if (n == 0)
4875 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00004876 /* new_size = self->length + n * (str2->length - str1->length)); */
4877 delta = (str2->length - str1->length);
4878 if (delta == 0) {
4879 new_size = self->length;
4880 } else {
4881 product = n * (str2->length - str1->length);
4882 if ((product / (str2->length - str1->length)) != n) {
4883 PyErr_SetString(PyExc_OverflowError,
4884 "replace string is too long");
4885 return NULL;
4886 }
4887 new_size = self->length + product;
4888 if (new_size < 0) {
4889 PyErr_SetString(PyExc_OverflowError,
4890 "replace string is too long");
4891 return NULL;
4892 }
4893 }
4894 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00004895 if (!u)
4896 return NULL;
4897 i = 0;
4898 p = u->str;
4899 if (str1->length > 0) {
4900 while (i <= self->length - str1->length)
4901 if (Py_UNICODE_MATCH(self, i, str1)) {
4902 /* replace string segment */
4903 Py_UNICODE_COPY(p, str2->str, str2->length);
4904 p += str2->length;
4905 i += str1->length;
4906 if (--n <= 0) {
4907 /* copy remaining part */
4908 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4909 break;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004910 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00004911 } else
4912 *p++ = self->str[i++];
4913 } else {
4914 while (n > 0) {
4915 Py_UNICODE_COPY(p, str2->str, str2->length);
4916 p += str2->length;
4917 if (--n <= 0)
4918 break;
4919 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004920 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00004921 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004922 }
4923 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00004925
4926nothing:
4927 /* nothing to replace; return original string (when possible) */
4928 if (PyUnicode_CheckExact(self)) {
4929 Py_INCREF(self);
4930 return (PyObject *) self;
4931 }
4932 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004933}
4934
4935/* --- Unicode Object Methods --------------------------------------------- */
4936
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004937PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938"S.title() -> unicode\n\
4939\n\
4940Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004941characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004942
4943static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004944unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004945{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004946 return fixup(self, fixtitle);
4947}
4948
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004949PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004950"S.capitalize() -> unicode\n\
4951\n\
4952Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004953have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004954
4955static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004956unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004957{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004958 return fixup(self, fixcapitalize);
4959}
4960
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled method S.capwords(): split on whitespace, capitalize every
   word, then join the words again.  Returns NULL on failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t k;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *plain = PyList_GET_ITEM(words, k);
        PyObject *capitalized = fixup((PyUnicodeObject *)plain,
                                      fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(plain);
        PyList_SET_ITEM(words, k, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
4998
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004999/* Argument converter. Coerces to a single unicode character */
5000
5001static int
5002convert_uc(PyObject *obj, void *addr)
5003{
5004 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5005 PyObject *uniobj;
5006 Py_UNICODE *unistr;
5007
5008 uniobj = PyUnicode_FromObject(obj);
5009 if (uniobj == NULL) {
5010 PyErr_SetString(PyExc_TypeError,
5011 "The fill character cannot be converted to Unicode");
5012 return 0;
5013 }
5014 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5015 PyErr_SetString(PyExc_TypeError,
5016 "The fill character must be exactly one character long");
5017 Py_DECREF(uniobj);
5018 return 0;
5019 }
5020 unistr = PyUnicode_AS_UNICODE(uniobj);
5021 *fillcharloc = unistr[0];
5022 Py_DECREF(uniobj);
5023 return 1;
5024}
5025
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005026PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005027"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005028\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005029Return S centered in a Unicode string of length width. Padding is\n\
5030done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005031
5032static PyObject *
5033unicode_center(PyUnicodeObject *self, PyObject *args)
5034{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005035 Py_ssize_t marg, left;
5036 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005037 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005038
Thomas Woutersde017742006-02-16 19:34:37 +00005039 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005040 return NULL;
5041
Tim Peters7a29bd52001-09-12 03:03:31 +00005042 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005043 Py_INCREF(self);
5044 return (PyObject*) self;
5045 }
5046
5047 marg = width - self->length;
5048 left = marg / 2 + (marg & width & 1);
5049
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005050 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005051}
5052
Marc-André Lemburge5034372000-08-08 08:04:29 +00005053#if 0
5054
5055/* This code should go into some future Unicode collation support
5056 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005057 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005058
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005059/* speedy UTF-16 code point order comparison */
5060/* gleaned from: */
5061/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5062
/* Per-block adjustment, indexed by (code unit >> 11), i.e. one entry per
   0x800 code units.  Used by the disabled unicode_compare() below. */
static short utf16Fixup[32] =
{
    /*  0 ..  7 */  0, 0, 0, 0, 0, 0, 0, 0,
    /*  8 .. 15 */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 16 .. 23 */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 24 .. 26 */  0, 0, 0,
    /* 27: 0xD800..0xDFFF */
                    0x2000,
    /* 28 .. 31: 0xE000..0xFFFF */
                    -0x800, -0x800, -0x800, -0x800
};
5070
Guido van Rossumd57fd912000-03-10 22:53:23 +00005071static int
5072unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5073{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005074 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005075
Guido van Rossumd57fd912000-03-10 22:53:23 +00005076 Py_UNICODE *s1 = str1->str;
5077 Py_UNICODE *s2 = str2->str;
5078
5079 len1 = str1->length;
5080 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005081
Guido van Rossumd57fd912000-03-10 22:53:23 +00005082 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005083 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005084
5085 c1 = *s1++;
5086 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005087
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005088 if (c1 > (1<<11) * 26)
5089 c1 += utf16Fixup[c1>>11];
5090 if (c2 > (1<<11) * 26)
5091 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005092 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005093
5094 if (c1 != c2)
5095 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005096
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005097 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005098 }
5099
5100 return (len1 < len2) ? -1 : (len1 != len2);
5101}
5102
Marc-André Lemburge5034372000-08-08 08:04:29 +00005103#else
5104
5105static int
5106unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5107{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005108 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005109
5110 Py_UNICODE *s1 = str1->str;
5111 Py_UNICODE *s2 = str2->str;
5112
5113 len1 = str1->length;
5114 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005115
Marc-André Lemburge5034372000-08-08 08:04:29 +00005116 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005117 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005118
Fredrik Lundh45714e92001-06-26 16:39:36 +00005119 c1 = *s1++;
5120 c2 = *s2++;
5121
5122 if (c1 != c2)
5123 return (c1 < c2) ? -1 : 1;
5124
Marc-André Lemburge5034372000-08-08 08:04:29 +00005125 len1--; len2--;
5126 }
5127
5128 return (len1 < len2) ? -1 : (len1 != len2);
5129}
5130
5131#endif
5132
Guido van Rossumd57fd912000-03-10 22:53:23 +00005133int PyUnicode_Compare(PyObject *left,
5134 PyObject *right)
5135{
5136 PyUnicodeObject *u = NULL, *v = NULL;
5137 int result;
5138
5139 /* Coerce the two arguments */
5140 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5141 if (u == NULL)
5142 goto onError;
5143 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5144 if (v == NULL)
5145 goto onError;
5146
Thomas Wouters7e474022000-07-16 12:04:32 +00005147 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005148 if (v == u) {
5149 Py_DECREF(u);
5150 Py_DECREF(v);
5151 return 0;
5152 }
5153
5154 result = unicode_compare(u, v);
5155
5156 Py_DECREF(u);
5157 Py_DECREF(v);
5158 return result;
5159
5160onError:
5161 Py_XDECREF(u);
5162 Py_XDECREF(v);
5163 return -1;
5164}
5165
Guido van Rossum403d68b2000-03-13 15:55:09 +00005166int PyUnicode_Contains(PyObject *container,
5167 PyObject *element)
5168{
Fredrik Lundh833bf942006-05-23 10:12:21 +00005169 PyUnicodeObject *u, *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005170 int result;
5171 Py_ssize_t size;
Fredrik Lundhd5e0dc52006-05-24 15:11:01 +00005172#ifdef USE_FAST
5173 Py_ssize_t pos;
5174#endif
Guido van Rossum403d68b2000-03-13 15:55:09 +00005175
5176 /* Coerce the two arguments */
Fredrik Lundh833bf942006-05-23 10:12:21 +00005177 v = (PyUnicodeObject *) PyUnicode_FromObject(element);
5178 if (!v) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005179 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005180 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00005181 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005182 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00005183
5184 u = (PyUnicodeObject *) PyUnicode_FromObject(container);
5185 if (!u) {
5186 Py_DECREF(v);
5187 return -1;
5188 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005189
Barry Warsaw817918c2002-08-06 16:58:21 +00005190 size = PyUnicode_GET_SIZE(v);
Fredrik Lundh833bf942006-05-23 10:12:21 +00005191 if (!size) {
5192 result = 1;
5193 goto done;
5194 }
Barry Warsaw817918c2002-08-06 16:58:21 +00005195
Fredrik Lundhd5e0dc52006-05-24 15:11:01 +00005196#ifdef USE_FAST
5197 pos = fastsearch(
5198 PyUnicode_AS_UNICODE(u), PyUnicode_GET_SIZE(u),
5199 PyUnicode_AS_UNICODE(v), size, FAST_SEARCH
5200 );
5201 result = (pos != -1);
5202#else
Guido van Rossum403d68b2000-03-13 15:55:09 +00005203 result = 0;
Fredrik Lundh833bf942006-05-23 10:12:21 +00005204
Barry Warsaw817918c2002-08-06 16:58:21 +00005205 if (size == 1) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00005206 Py_UNICODE chr = PyUnicode_AS_UNICODE(v)[0];
5207 Py_UNICODE* ptr = PyUnicode_AS_UNICODE(u);
5208 Py_UNICODE* end = ptr + PyUnicode_GET_SIZE(u);
5209 for (; ptr < end; ptr++) {
5210 if (*ptr == chr) {
Barry Warsaw817918c2002-08-06 16:58:21 +00005211 result = 1;
5212 break;
5213 }
5214 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00005215 } else {
Fredrik Lundh240bf2a2006-05-24 10:20:36 +00005216 Py_ssize_t start = 0;
5217 Py_ssize_t end = PyUnicode_GET_SIZE(u) - size;
Fredrik Lundh833bf942006-05-23 10:12:21 +00005218 for (; start <= end; start++)
5219 if (Py_UNICODE_MATCH(u, start, v)) {
5220 result = 1;
5221 break;
5222 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005223 }
Fredrik Lundhd5e0dc52006-05-24 15:11:01 +00005224#endif
Guido van Rossum403d68b2000-03-13 15:55:09 +00005225
Fredrik Lundh833bf942006-05-23 10:12:21 +00005226done:
Guido van Rossum403d68b2000-03-13 15:55:09 +00005227 Py_DECREF(u);
5228 Py_DECREF(v);
5229 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005230}
5231
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232/* Concat to string or Unicode object giving a new Unicode object. */
5233
5234PyObject *PyUnicode_Concat(PyObject *left,
5235 PyObject *right)
5236{
5237 PyUnicodeObject *u = NULL, *v = NULL, *w;
5238
5239 /* Coerce the two arguments */
5240 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5241 if (u == NULL)
5242 goto onError;
5243 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5244 if (v == NULL)
5245 goto onError;
5246
5247 /* Shortcuts */
5248 if (v == unicode_empty) {
5249 Py_DECREF(v);
5250 return (PyObject *)u;
5251 }
5252 if (u == unicode_empty) {
5253 Py_DECREF(u);
5254 return (PyObject *)v;
5255 }
5256
5257 /* Concat the two Unicode strings */
5258 w = _PyUnicode_New(u->length + v->length);
5259 if (w == NULL)
5260 goto onError;
5261 Py_UNICODE_COPY(w->str, u->str, u->length);
5262 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5263
5264 Py_DECREF(u);
5265 Py_DECREF(v);
5266 return (PyObject *)w;
5267
5268onError:
5269 Py_XDECREF(u);
5270 Py_XDECREF(v);
5271 return NULL;
5272}
5273
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005274PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275"S.count(sub[, start[, end]]) -> int\n\
5276\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00005277Return the number of non-overlapping occurrences of substring sub in\n\
5278Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005279interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280
5281static PyObject *
5282unicode_count(PyUnicodeObject *self, PyObject *args)
5283{
5284 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005285 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005286 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287 PyObject *result;
5288
Guido van Rossumb8872e62000-05-09 14:14:27 +00005289 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5290 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005291 return NULL;
5292
5293 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5294 (PyObject *)substring);
5295 if (substring == NULL)
5296 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005297
Guido van Rossumd57fd912000-03-10 22:53:23 +00005298 if (start < 0)
5299 start += self->length;
5300 if (start < 0)
5301 start = 0;
5302 if (end > self->length)
5303 end = self->length;
5304 if (end < 0)
5305 end += self->length;
5306 if (end < 0)
5307 end = 0;
5308
Andrew Dalkeb552c4d2006-05-25 18:03:25 +00005309 result = PyInt_FromSsize_t(count(self, start, end, substring));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310
5311 Py_DECREF(substring);
5312 return result;
5313}
5314
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005315PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005316"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005318Encodes S using the codec registered for encoding. encoding defaults\n\
5319to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005320handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005321a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5322'xmlcharrefreplace' as well as any other name registered with\n\
5323codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324
5325static PyObject *
5326unicode_encode(PyUnicodeObject *self, PyObject *args)
5327{
5328 char *encoding = NULL;
5329 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005330 PyObject *v;
5331
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5333 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005334 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005335 if (v == NULL)
5336 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005337 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5338 PyErr_Format(PyExc_TypeError,
5339 "encoder did not return a string/unicode object "
5340 "(type=%.400s)",
5341 v->ob_type->tp_name);
5342 Py_DECREF(v);
5343 return NULL;
5344 }
5345 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005346
5347 onError:
5348 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005349}
5350
5351PyDoc_STRVAR(decode__doc__,
5352"S.decode([encoding[,errors]]) -> string or unicode\n\
5353\n\
5354Decodes S using the codec registered for encoding. encoding defaults\n\
5355to the default encoding. errors may be given to set a different error\n\
5356handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5357a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5358as well as any other name registerd with codecs.register_error that is\n\
5359able to handle UnicodeDecodeErrors.");
5360
5361static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005362unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005363{
5364 char *encoding = NULL;
5365 char *errors = NULL;
5366 PyObject *v;
5367
5368 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5369 return NULL;
5370 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005371 if (v == NULL)
5372 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005373 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5374 PyErr_Format(PyExc_TypeError,
5375 "decoder did not return a string/unicode object "
5376 "(type=%.400s)",
5377 v->ob_type->tp_name);
5378 Py_DECREF(v);
5379 return NULL;
5380 }
5381 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005382
5383 onError:
5384 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385}
5386
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005387PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005388"S.expandtabs([tabsize]) -> unicode\n\
5389\n\
5390Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005391If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392
5393static PyObject*
5394unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5395{
5396 Py_UNICODE *e;
5397 Py_UNICODE *p;
5398 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005399 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400 PyUnicodeObject *u;
5401 int tabsize = 8;
5402
5403 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5404 return NULL;
5405
Thomas Wouters7e474022000-07-16 12:04:32 +00005406 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407 i = j = 0;
5408 e = self->str + self->length;
5409 for (p = self->str; p < e; p++)
5410 if (*p == '\t') {
5411 if (tabsize > 0)
5412 j += tabsize - (j % tabsize);
5413 }
5414 else {
5415 j++;
5416 if (*p == '\n' || *p == '\r') {
5417 i += j;
5418 j = 0;
5419 }
5420 }
5421
5422 /* Second pass: create output string and fill it */
5423 u = _PyUnicode_New(i + j);
5424 if (!u)
5425 return NULL;
5426
5427 j = 0;
5428 q = u->str;
5429
5430 for (p = self->str; p < e; p++)
5431 if (*p == '\t') {
5432 if (tabsize > 0) {
5433 i = tabsize - (j % tabsize);
5434 j += i;
5435 while (i--)
5436 *q++ = ' ';
5437 }
5438 }
5439 else {
5440 j++;
5441 *q++ = *p;
5442 if (*p == '\n' || *p == '\r')
5443 j = 0;
5444 }
5445
5446 return (PyObject*) u;
5447}
5448
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005449PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450"S.find(sub [,start [,end]]) -> int\n\
5451\n\
5452Return the lowest index in S where substring sub is found,\n\
5453such that sub is contained within s[start,end]. Optional\n\
5454arguments start and end are interpreted as in slice notation.\n\
5455\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005456Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457
5458static PyObject *
5459unicode_find(PyUnicodeObject *self, PyObject *args)
5460{
5461 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005462 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005463 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464 PyObject *result;
5465
Guido van Rossumb8872e62000-05-09 14:14:27 +00005466 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5467 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468 return NULL;
5469 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5470 (PyObject *)substring);
5471 if (substring == NULL)
5472 return NULL;
5473
Martin v. Löwis18e16552006-02-15 17:27:45 +00005474 result = PyInt_FromSsize_t(findstring(self, substring, start, end, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475
5476 Py_DECREF(substring);
5477 return result;
5478}
5479
5480static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005481unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482{
5483 if (index < 0 || index >= self->length) {
5484 PyErr_SetString(PyExc_IndexError, "string index out of range");
5485 return NULL;
5486 }
5487
5488 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5489}
5490
5491static long
5492unicode_hash(PyUnicodeObject *self)
5493{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005494 /* Since Unicode objects compare equal to their ASCII string
5495 counterparts, they should use the individual character values
5496 as basis for their hash value. This is needed to assure that
5497 strings and Unicode objects behave in the same way as
5498 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499
Martin v. Löwis18e16552006-02-15 17:27:45 +00005500 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005501 register Py_UNICODE *p;
5502 register long x;
5503
Guido van Rossumd57fd912000-03-10 22:53:23 +00005504 if (self->hash != -1)
5505 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005506 len = PyUnicode_GET_SIZE(self);
5507 p = PyUnicode_AS_UNICODE(self);
5508 x = *p << 7;
5509 while (--len >= 0)
5510 x = (1000003*x) ^ *p++;
5511 x ^= PyUnicode_GET_SIZE(self);
5512 if (x == -1)
5513 x = -2;
5514 self->hash = x;
5515 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005516}
5517
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005518PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519"S.index(sub [,start [,end]]) -> int\n\
5520\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005521Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522
5523static PyObject *
5524unicode_index(PyUnicodeObject *self, PyObject *args)
5525{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005526 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005528 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005529 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005530
Guido van Rossumb8872e62000-05-09 14:14:27 +00005531 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5532 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005533 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005534
Guido van Rossumd57fd912000-03-10 22:53:23 +00005535 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5536 (PyObject *)substring);
5537 if (substring == NULL)
5538 return NULL;
5539
5540 result = findstring(self, substring, start, end, 1);
5541
5542 Py_DECREF(substring);
5543 if (result < 0) {
5544 PyErr_SetString(PyExc_ValueError, "substring not found");
5545 return NULL;
5546 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005547 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005548}
5549
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005550PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005551"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005553Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005554at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555
5556static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005557unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558{
5559 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5560 register const Py_UNICODE *e;
5561 int cased;
5562
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563 /* Shortcut for single character strings */
5564 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005565 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005567 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005568 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005569 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005570
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571 e = p + PyUnicode_GET_SIZE(self);
5572 cased = 0;
5573 for (; p < e; p++) {
5574 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005575
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005577 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578 else if (!cased && Py_UNICODE_ISLOWER(ch))
5579 cased = 1;
5580 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005581 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582}
5583
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005584PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005585"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005587Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005588at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589
5590static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005591unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592{
5593 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5594 register const Py_UNICODE *e;
5595 int cased;
5596
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597 /* Shortcut for single character strings */
5598 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005599 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005601 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005602 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005603 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005604
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605 e = p + PyUnicode_GET_SIZE(self);
5606 cased = 0;
5607 for (; p < e; p++) {
5608 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005609
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005611 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612 else if (!cased && Py_UNICODE_ISUPPER(ch))
5613 cased = 1;
5614 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005615 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616}
5617
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005618PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005619"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005621Return True if S is a titlecased string and there is at least one\n\
5622character in S, i.e. upper- and titlecase characters may only\n\
5623follow uncased characters and lowercase characters only cased ones.\n\
5624Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625
5626static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005627unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628{
5629 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5630 register const Py_UNICODE *e;
5631 int cased, previous_is_cased;
5632
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633 /* Shortcut for single character strings */
5634 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005635 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5636 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005637
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005638 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005639 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005640 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005641
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642 e = p + PyUnicode_GET_SIZE(self);
5643 cased = 0;
5644 previous_is_cased = 0;
5645 for (; p < e; p++) {
5646 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005647
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5649 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005650 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651 previous_is_cased = 1;
5652 cased = 1;
5653 }
5654 else if (Py_UNICODE_ISLOWER(ch)) {
5655 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005656 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657 previous_is_cased = 1;
5658 cased = 1;
5659 }
5660 else
5661 previous_is_cased = 0;
5662 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005663 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664}
5665
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005666PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005667"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005669Return True if all characters in S are whitespace\n\
5670and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671
5672static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005673unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674{
5675 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5676 register const Py_UNICODE *e;
5677
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678 /* Shortcut for single character strings */
5679 if (PyUnicode_GET_SIZE(self) == 1 &&
5680 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005681 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005683 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005684 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005685 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005686
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687 e = p + PyUnicode_GET_SIZE(self);
5688 for (; p < e; p++) {
5689 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005690 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005692 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693}
5694
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005695PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005696"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005697\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005698Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005699and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005700
5701static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005702unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005703{
5704 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5705 register const Py_UNICODE *e;
5706
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005707 /* Shortcut for single character strings */
5708 if (PyUnicode_GET_SIZE(self) == 1 &&
5709 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005710 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005711
5712 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005713 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005714 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005715
5716 e = p + PyUnicode_GET_SIZE(self);
5717 for (; p < e; p++) {
5718 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005719 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005720 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005721 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005722}
5723
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005724PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005725"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005726\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005727Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005728and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005729
5730static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005731unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005732{
5733 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5734 register const Py_UNICODE *e;
5735
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005736 /* Shortcut for single character strings */
5737 if (PyUnicode_GET_SIZE(self) == 1 &&
5738 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005739 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005740
5741 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005742 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005743 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005744
5745 e = p + PyUnicode_GET_SIZE(self);
5746 for (; p < e; p++) {
5747 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005748 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005749 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005750 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005751}
5752
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005753PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005754"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005756Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005757False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758
5759static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005760unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761{
5762 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5763 register const Py_UNICODE *e;
5764
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765 /* Shortcut for single character strings */
5766 if (PyUnicode_GET_SIZE(self) == 1 &&
5767 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005768 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005770 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005771 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005772 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005773
Guido van Rossumd57fd912000-03-10 22:53:23 +00005774 e = p + PyUnicode_GET_SIZE(self);
5775 for (; p < e; p++) {
5776 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005777 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005779 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005780}
5781
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005782PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005783"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005784\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005785Return True if all characters in S are digits\n\
5786and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787
5788static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005789unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790{
5791 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5792 register const Py_UNICODE *e;
5793
Guido van Rossumd57fd912000-03-10 22:53:23 +00005794 /* Shortcut for single character strings */
5795 if (PyUnicode_GET_SIZE(self) == 1 &&
5796 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005797 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005799 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005800 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005801 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005802
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803 e = p + PyUnicode_GET_SIZE(self);
5804 for (; p < e; p++) {
5805 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005806 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005807 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005808 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005809}
5810
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005811PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005812"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005814Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005815False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005816
5817static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005818unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819{
5820 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5821 register const Py_UNICODE *e;
5822
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823 /* Shortcut for single character strings */
5824 if (PyUnicode_GET_SIZE(self) == 1 &&
5825 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005826 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005828 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005829 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005830 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005831
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832 e = p + PyUnicode_GET_SIZE(self);
5833 for (; p < e; p++) {
5834 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005835 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005837 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005838}
5839
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005840PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841"S.join(sequence) -> unicode\n\
5842\n\
5843Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005844sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845
5846static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005847unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005849 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850}
5851
Martin v. Löwis18e16552006-02-15 17:27:45 +00005852static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853unicode_length(PyUnicodeObject *self)
5854{
5855 return self->length;
5856}
5857
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005858PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005859"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860\n\
5861Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005862done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863
5864static PyObject *
5865unicode_ljust(PyUnicodeObject *self, PyObject *args)
5866{
Martin v. Löwis412fb672006-04-13 06:34:32 +00005867 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005868 Py_UNICODE fillchar = ' ';
5869
Martin v. Löwis412fb672006-04-13 06:34:32 +00005870 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871 return NULL;
5872
Tim Peters7a29bd52001-09-12 03:03:31 +00005873 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874 Py_INCREF(self);
5875 return (PyObject*) self;
5876 }
5877
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005878 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879}
5880
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005881PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882"S.lower() -> unicode\n\
5883\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005884Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005885
5886static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005887unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005889 return fixup(self, fixlower);
5890}
5891
/* Which end(s) of the string a strip operation works on.  The values are
   also used as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the strip type above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Bare method name for a strip type: the format string with its leading
   "|O:" (3 characters) skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
5900
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005901/* externally visible for str.strip(unicode) */
5902PyObject *
5903_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5904{
5905 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005906 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005907 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005908 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
5909 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005910
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005911 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
5912
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005913 i = 0;
5914 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005915 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
5916 i++;
5917 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005918 }
5919
5920 j = len;
5921 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005922 do {
5923 j--;
5924 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
5925 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005926 }
5927
5928 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005929 Py_INCREF(self);
5930 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005931 }
5932 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005933 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005934}
5935
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936
5937static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005938do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005940 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005941 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005942
5943 i = 0;
5944 if (striptype != RIGHTSTRIP) {
5945 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5946 i++;
5947 }
5948 }
5949
5950 j = len;
5951 if (striptype != LEFTSTRIP) {
5952 do {
5953 j--;
5954 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5955 j++;
5956 }
5957
5958 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5959 Py_INCREF(self);
5960 return (PyObject*)self;
5961 }
5962 else
5963 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964}
5965
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005966
5967static PyObject *
5968do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5969{
5970 PyObject *sep = NULL;
5971
5972 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5973 return NULL;
5974
5975 if (sep != NULL && sep != Py_None) {
5976 if (PyUnicode_Check(sep))
5977 return _PyUnicode_XStrip(self, striptype, sep);
5978 else if (PyString_Check(sep)) {
5979 PyObject *res;
5980 sep = PyUnicode_FromObject(sep);
5981 if (sep==NULL)
5982 return NULL;
5983 res = _PyUnicode_XStrip(self, striptype, sep);
5984 Py_DECREF(sep);
5985 return res;
5986 }
5987 else {
5988 PyErr_Format(PyExc_TypeError,
5989 "%s arg must be None, unicode or str",
5990 STRIPNAME(striptype));
5991 return NULL;
5992 }
5993 }
5994
5995 return do_strip(self, striptype);
5996}
5997
5998
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005999PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006000"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006001\n\
6002Return a copy of the string S with leading and trailing\n\
6003whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006004If chars is given and not None, remove characters in chars instead.\n\
6005If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006006
6007static PyObject *
6008unicode_strip(PyUnicodeObject *self, PyObject *args)
6009{
6010 if (PyTuple_GET_SIZE(args) == 0)
6011 return do_strip(self, BOTHSTRIP); /* Common case */
6012 else
6013 return do_argstrip(self, BOTHSTRIP, args);
6014}
6015
6016
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006017PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006018"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006019\n\
6020Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006021If chars is given and not None, remove characters in chars instead.\n\
6022If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006023
6024static PyObject *
6025unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6026{
6027 if (PyTuple_GET_SIZE(args) == 0)
6028 return do_strip(self, LEFTSTRIP); /* Common case */
6029 else
6030 return do_argstrip(self, LEFTSTRIP, args);
6031}
6032
6033
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006034PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006035"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006036\n\
6037Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006038If chars is given and not None, remove characters in chars instead.\n\
6039If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006040
6041static PyObject *
6042unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6043{
6044 if (PyTuple_GET_SIZE(args) == 0)
6045 return do_strip(self, RIGHTSTRIP); /* Common case */
6046 else
6047 return do_argstrip(self, RIGHTSTRIP, args);
6048}
6049
6050
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006052unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053{
6054 PyUnicodeObject *u;
6055 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006056 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006057 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058
6059 if (len < 0)
6060 len = 0;
6061
Tim Peters7a29bd52001-09-12 03:03:31 +00006062 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063 /* no repeat, return original string */
6064 Py_INCREF(str);
6065 return (PyObject*) str;
6066 }
Tim Peters8f422462000-09-09 06:13:41 +00006067
6068 /* ensure # of chars needed doesn't overflow int and # of bytes
6069 * needed doesn't overflow size_t
6070 */
6071 nchars = len * str->length;
6072 if (len && nchars / len != str->length) {
6073 PyErr_SetString(PyExc_OverflowError,
6074 "repeated string is too long");
6075 return NULL;
6076 }
6077 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6078 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6079 PyErr_SetString(PyExc_OverflowError,
6080 "repeated string is too long");
6081 return NULL;
6082 }
6083 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084 if (!u)
6085 return NULL;
6086
6087 p = u->str;
6088
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006089 if (str->length == 1 && len > 0) {
6090 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006091 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00006092 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006093 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006094 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006095 done = str->length;
6096 }
6097 while (done < nchars) {
6098 int n = (done <= nchars-done) ? done : nchars-done;
6099 Py_UNICODE_COPY(p+done, p, n);
6100 done += n;
6101 }
6102 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103
6104 return (PyObject*) u;
6105}
6106
6107PyObject *PyUnicode_Replace(PyObject *obj,
6108 PyObject *subobj,
6109 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006110 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111{
6112 PyObject *self;
6113 PyObject *str1;
6114 PyObject *str2;
6115 PyObject *result;
6116
6117 self = PyUnicode_FromObject(obj);
6118 if (self == NULL)
6119 return NULL;
6120 str1 = PyUnicode_FromObject(subobj);
6121 if (str1 == NULL) {
6122 Py_DECREF(self);
6123 return NULL;
6124 }
6125 str2 = PyUnicode_FromObject(replobj);
6126 if (str2 == NULL) {
6127 Py_DECREF(self);
6128 Py_DECREF(str1);
6129 return NULL;
6130 }
Tim Petersced69f82003-09-16 20:30:58 +00006131 result = replace((PyUnicodeObject *)self,
6132 (PyUnicodeObject *)str1,
6133 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134 maxcount);
6135 Py_DECREF(self);
6136 Py_DECREF(str1);
6137 Py_DECREF(str2);
6138 return result;
6139}
6140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006141PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142"S.replace (old, new[, maxsplit]) -> unicode\n\
6143\n\
6144Return a copy of S with all occurrences of substring\n\
6145old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006146given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147
6148static PyObject*
6149unicode_replace(PyUnicodeObject *self, PyObject *args)
6150{
6151 PyUnicodeObject *str1;
6152 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006153 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 PyObject *result;
6155
Martin v. Löwis18e16552006-02-15 17:27:45 +00006156 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 return NULL;
6158 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6159 if (str1 == NULL)
6160 return NULL;
6161 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006162 if (str2 == NULL) {
6163 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006165 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166
6167 result = replace(self, str1, str2, maxcount);
6168
6169 Py_DECREF(str1);
6170 Py_DECREF(str2);
6171 return result;
6172}
6173
6174static
6175PyObject *unicode_repr(PyObject *unicode)
6176{
6177 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6178 PyUnicode_GET_SIZE(unicode),
6179 1);
6180}
6181
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006182PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183"S.rfind(sub [,start [,end]]) -> int\n\
6184\n\
6185Return the highest index in S where substring sub is found,\n\
6186such that sub is contained within s[start,end]. Optional\n\
6187arguments start and end are interpreted as in slice notation.\n\
6188\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006189Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190
6191static PyObject *
6192unicode_rfind(PyUnicodeObject *self, PyObject *args)
6193{
6194 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006195 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006196 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197 PyObject *result;
6198
Guido van Rossumb8872e62000-05-09 14:14:27 +00006199 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6200 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201 return NULL;
6202 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6203 (PyObject *)substring);
6204 if (substring == NULL)
6205 return NULL;
6206
Martin v. Löwis18e16552006-02-15 17:27:45 +00006207 result = PyInt_FromSsize_t(findstring(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208
6209 Py_DECREF(substring);
6210 return result;
6211}
6212
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006213PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214"S.rindex(sub [,start [,end]]) -> int\n\
6215\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006216Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217
6218static PyObject *
6219unicode_rindex(PyUnicodeObject *self, PyObject *args)
6220{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006221 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006223 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006224 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225
Guido van Rossumb8872e62000-05-09 14:14:27 +00006226 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6227 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228 return NULL;
6229 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6230 (PyObject *)substring);
6231 if (substring == NULL)
6232 return NULL;
6233
6234 result = findstring(self, substring, start, end, -1);
6235
6236 Py_DECREF(substring);
6237 if (result < 0) {
6238 PyErr_SetString(PyExc_ValueError, "substring not found");
6239 return NULL;
6240 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006241 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006242}
6243
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006244PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006245"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246\n\
6247Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006248done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249
6250static PyObject *
6251unicode_rjust(PyUnicodeObject *self, PyObject *args)
6252{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006253 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006254 Py_UNICODE fillchar = ' ';
6255
Martin v. Löwis412fb672006-04-13 06:34:32 +00006256 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257 return NULL;
6258
Tim Peters7a29bd52001-09-12 03:03:31 +00006259 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260 Py_INCREF(self);
6261 return (PyObject*) self;
6262 }
6263
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006264 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265}
6266
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006268unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269{
6270 /* standard clamping */
6271 if (start < 0)
6272 start = 0;
6273 if (end < 0)
6274 end = 0;
6275 if (end > self->length)
6276 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006277 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006278 /* full slice, return original string */
6279 Py_INCREF(self);
6280 return (PyObject*) self;
6281 }
6282 if (start > end)
6283 start = end;
6284 /* copy slice */
6285 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6286 end - start);
6287}
6288
6289PyObject *PyUnicode_Split(PyObject *s,
6290 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006291 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292{
6293 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006294
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295 s = PyUnicode_FromObject(s);
6296 if (s == NULL)
6297 return NULL;
6298 if (sep != NULL) {
6299 sep = PyUnicode_FromObject(sep);
6300 if (sep == NULL) {
6301 Py_DECREF(s);
6302 return NULL;
6303 }
6304 }
6305
6306 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6307
6308 Py_DECREF(s);
6309 Py_XDECREF(sep);
6310 return result;
6311}
6312
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006313PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314"S.split([sep [,maxsplit]]) -> list of strings\n\
6315\n\
6316Return a list of the words in S, using sep as the\n\
6317delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006318splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006319any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320
6321static PyObject*
6322unicode_split(PyUnicodeObject *self, PyObject *args)
6323{
6324 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006325 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006326
Martin v. Löwis18e16552006-02-15 17:27:45 +00006327 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328 return NULL;
6329
6330 if (substring == Py_None)
6331 return split(self, NULL, maxcount);
6332 else if (PyUnicode_Check(substring))
6333 return split(self, (PyUnicodeObject *)substring, maxcount);
6334 else
6335 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6336}
6337
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006338PyObject *PyUnicode_RSplit(PyObject *s,
6339 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006340 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006341{
6342 PyObject *result;
6343
6344 s = PyUnicode_FromObject(s);
6345 if (s == NULL)
6346 return NULL;
6347 if (sep != NULL) {
6348 sep = PyUnicode_FromObject(sep);
6349 if (sep == NULL) {
6350 Py_DECREF(s);
6351 return NULL;
6352 }
6353 }
6354
6355 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6356
6357 Py_DECREF(s);
6358 Py_XDECREF(sep);
6359 return result;
6360}
6361
6362PyDoc_STRVAR(rsplit__doc__,
6363"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6364\n\
6365Return a list of the words in S, using sep as the\n\
6366delimiter string, starting at the end of the string and\n\
6367working to the front. If maxsplit is given, at most maxsplit\n\
6368splits are done. If sep is not specified, any whitespace string\n\
6369is a separator.");
6370
6371static PyObject*
6372unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6373{
6374 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006375 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006376
Martin v. Löwis18e16552006-02-15 17:27:45 +00006377 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006378 return NULL;
6379
6380 if (substring == Py_None)
6381 return rsplit(self, NULL, maxcount);
6382 else if (PyUnicode_Check(substring))
6383 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6384 else
6385 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6386}
6387
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006388PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006389"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390\n\
6391Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006392Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006393is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394
6395static PyObject*
6396unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6397{
Guido van Rossum86662912000-04-11 15:38:46 +00006398 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399
Guido van Rossum86662912000-04-11 15:38:46 +00006400 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401 return NULL;
6402
Guido van Rossum86662912000-04-11 15:38:46 +00006403 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404}
6405
6406static
6407PyObject *unicode_str(PyUnicodeObject *self)
6408{
Fred Drakee4315f52000-05-09 19:53:39 +00006409 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410}
6411
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006412PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413"S.swapcase() -> unicode\n\
6414\n\
6415Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006416and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417
6418static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006419unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421 return fixup(self, fixswapcase);
6422}
6423
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006424PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425"S.translate(table) -> unicode\n\
6426\n\
6427Return a copy of the string S, where all characters have been mapped\n\
6428through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006429Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6430Unmapped characters are left untouched. Characters mapped to None\n\
6431are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432
6433static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006434unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435{
Tim Petersced69f82003-09-16 20:30:58 +00006436 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006438 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439 "ignore");
6440}
6441
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006442PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006443"S.upper() -> unicode\n\
6444\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006445Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446
6447static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006448unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 return fixup(self, fixupper);
6451}
6452
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006453PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454"S.zfill(width) -> unicode\n\
6455\n\
6456Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006457of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458
6459static PyObject *
6460unicode_zfill(PyUnicodeObject *self, PyObject *args)
6461{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006462 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463 PyUnicodeObject *u;
6464
Martin v. Löwis18e16552006-02-15 17:27:45 +00006465 Py_ssize_t width;
6466 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467 return NULL;
6468
6469 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006470 if (PyUnicode_CheckExact(self)) {
6471 Py_INCREF(self);
6472 return (PyObject*) self;
6473 }
6474 else
6475 return PyUnicode_FromUnicode(
6476 PyUnicode_AS_UNICODE(self),
6477 PyUnicode_GET_SIZE(self)
6478 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479 }
6480
6481 fill = width - self->length;
6482
6483 u = pad(self, fill, 0, '0');
6484
Walter Dörwald068325e2002-04-15 13:36:47 +00006485 if (u == NULL)
6486 return NULL;
6487
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488 if (u->str[fill] == '+' || u->str[fill] == '-') {
6489 /* move sign to beginning of string */
6490 u->str[0] = u->str[fill];
6491 u->str[fill] = '0';
6492 }
6493
6494 return (PyObject*) u;
6495}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496
#if 0
/* Compiled out.  Reports unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
6504
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006505PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006506"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006508Return True if S starts with the specified prefix, False otherwise.\n\
6509With optional start, test S beginning at that position.\n\
6510With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511
6512static PyObject *
6513unicode_startswith(PyUnicodeObject *self,
6514 PyObject *args)
6515{
6516 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006517 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006518 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519 PyObject *result;
6520
Guido van Rossumb8872e62000-05-09 14:14:27 +00006521 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6522 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523 return NULL;
6524 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6525 (PyObject *)substring);
6526 if (substring == NULL)
6527 return NULL;
6528
Guido van Rossum77f6a652002-04-03 22:41:51 +00006529 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530
6531 Py_DECREF(substring);
6532 return result;
6533}
6534
6535
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006536PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006537"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006539Return True if S ends with the specified suffix, False otherwise.\n\
6540With optional start, test S beginning at that position.\n\
6541With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542
6543static PyObject *
6544unicode_endswith(PyUnicodeObject *self,
6545 PyObject *args)
6546{
6547 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006548 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006549 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550 PyObject *result;
6551
Guido van Rossumb8872e62000-05-09 14:14:27 +00006552 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6553 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554 return NULL;
6555 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6556 (PyObject *)substring);
6557 if (substring == NULL)
6558 return NULL;
6559
Guido van Rossum77f6a652002-04-03 22:41:51 +00006560 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561
6562 Py_DECREF(substring);
6563 return result;
6564}
6565
6566
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006567
6568static PyObject *
6569unicode_getnewargs(PyUnicodeObject *v)
6570{
6571 return Py_BuildValue("(u#)", v->str, v->length);
6572}
6573
6574
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575static PyMethodDef unicode_methods[] = {
6576
6577 /* Order is according to common usage: often used methods should
6578 appear first, since lookup is done sequentially. */
6579
Georg Brandlecdc0a92006-03-30 12:19:07 +00006580 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006581 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6582 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006583 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006584 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6585 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6586 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6587 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6588 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6589 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6590 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
6591 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6592 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6593 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006594 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006595 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006596/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6597 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6598 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6599 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006600 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006601 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006602 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006603 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6604 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6605 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6606 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6607 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6608 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6609 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6610 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6611 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6612 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6613 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6614 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6615 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6616 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006617 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006618#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006619 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620#endif
6621
6622#if 0
6623 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006624 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625#endif
6626
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006627 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628 {NULL, NULL}
6629};
6630
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006631static PyObject *
6632unicode_mod(PyObject *v, PyObject *w)
6633{
6634 if (!PyUnicode_Check(v)) {
6635 Py_INCREF(Py_NotImplemented);
6636 return Py_NotImplemented;
6637 }
6638 return PyUnicode_Format(v, w);
6639}
6640
6641static PyNumberMethods unicode_as_number = {
6642 0, /*nb_add*/
6643 0, /*nb_subtract*/
6644 0, /*nb_multiply*/
6645 0, /*nb_divide*/
6646 unicode_mod, /*nb_remainder*/
6647};
6648
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006650 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00006651 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006652 (ssizeargfunc) unicode_repeat, /* sq_repeat */
6653 (ssizeargfunc) unicode_getitem, /* sq_item */
6654 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655 0, /* sq_ass_item */
6656 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00006657 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658};
6659
Guido van Rossum38fff8c2006-03-07 18:50:55 +00006660#define HASINDEX(o) PyType_HasFeature((o)->ob_type, Py_TPFLAGS_HAVE_INDEX)
6661
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006662static PyObject*
6663unicode_subscript(PyUnicodeObject* self, PyObject* item)
6664{
Guido van Rossum38fff8c2006-03-07 18:50:55 +00006665 PyNumberMethods *nb = item->ob_type->tp_as_number;
6666 if (nb != NULL && HASINDEX(item) && nb->nb_index != NULL) {
6667 Py_ssize_t i = nb->nb_index(item);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006668 if (i == -1 && PyErr_Occurred())
6669 return NULL;
6670 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006671 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006672 return unicode_getitem(self, i);
6673 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006674 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006675 Py_UNICODE* source_buf;
6676 Py_UNICODE* result_buf;
6677 PyObject* result;
6678
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006679 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006680 &start, &stop, &step, &slicelength) < 0) {
6681 return NULL;
6682 }
6683
6684 if (slicelength <= 0) {
6685 return PyUnicode_FromUnicode(NULL, 0);
6686 } else {
6687 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Anthony Baxtera6286212006-04-11 07:42:36 +00006688 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
6689 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006690
6691 if (result_buf == NULL)
6692 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006693
6694 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6695 result_buf[i] = source_buf[cur];
6696 }
Tim Petersced69f82003-09-16 20:30:58 +00006697
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006698 result = PyUnicode_FromUnicode(result_buf, slicelength);
6699 PyMem_FREE(result_buf);
6700 return result;
6701 }
6702 } else {
6703 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6704 return NULL;
6705 }
6706}
6707
6708static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006709 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006710 (binaryfunc)unicode_subscript, /* mp_subscript */
6711 (objobjargproc)0, /* mp_ass_subscript */
6712};
6713
Martin v. Löwis18e16552006-02-15 17:27:45 +00006714static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006715unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006716 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717 const void **ptr)
6718{
6719 if (index != 0) {
6720 PyErr_SetString(PyExc_SystemError,
6721 "accessing non-existent unicode segment");
6722 return -1;
6723 }
6724 *ptr = (void *) self->str;
6725 return PyUnicode_GET_DATA_SIZE(self);
6726}
6727
Martin v. Löwis18e16552006-02-15 17:27:45 +00006728static Py_ssize_t
6729unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730 const void **ptr)
6731{
6732 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006733 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734 return -1;
6735}
6736
6737static int
6738unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006739 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740{
6741 if (lenp)
6742 *lenp = PyUnicode_GET_DATA_SIZE(self);
6743 return 1;
6744}
6745
Martin v. Löwiseb079f12006-02-16 14:32:27 +00006746static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006748 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749 const void **ptr)
6750{
6751 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006752
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753 if (index != 0) {
6754 PyErr_SetString(PyExc_SystemError,
6755 "accessing non-existent unicode segment");
6756 return -1;
6757 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006758 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759 if (str == NULL)
6760 return -1;
6761 *ptr = (void *) PyString_AS_STRING(str);
6762 return PyString_GET_SIZE(str);
6763}
6764
6765/* Helpers for PyUnicode_Format() */
6766
6767static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006768getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006770 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771 if (argidx < arglen) {
6772 (*p_argidx)++;
6773 if (arglen < 0)
6774 return args;
6775 else
6776 return PyTuple_GetItem(args, argidx);
6777 }
6778 PyErr_SetString(PyExc_TypeError,
6779 "not enough arguments for format string");
6780 return NULL;
6781}
6782
/* Bit flags for the formatting helpers.  Each flag occupies its own bit
   so they can be OR-ed together and tested with &. */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)   /* makes formatfloat() emit '#' in its format */
#define F_ZERO  (1 << 4)
6788
Martin v. Löwis18e16552006-02-15 17:27:45 +00006789static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00006790strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006792 register Py_ssize_t i;
6793 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006794 for (i = len - 1; i >= 0; i--)
6795 buffer[i] = (Py_UNICODE) charbuffer[i];
6796
Guido van Rossumd57fd912000-03-10 22:53:23 +00006797 return len;
6798}
6799
Neal Norwitzfc76d632006-01-10 06:03:13 +00006800static int
6801doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
6802{
Tim Peters15231542006-02-16 01:08:01 +00006803 Py_ssize_t result;
6804
Neal Norwitzfc76d632006-01-10 06:03:13 +00006805 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006806 result = strtounicode(buffer, (char *)buffer);
6807 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006808}
6809
6810static int
6811longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
6812{
Tim Peters15231542006-02-16 01:08:01 +00006813 Py_ssize_t result;
6814
Neal Norwitzfc76d632006-01-10 06:03:13 +00006815 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006816 result = strtounicode(buffer, (char *)buffer);
6817 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006818}
6819
Guido van Rossum078151d2002-08-11 04:24:12 +00006820/* XXX To save some code duplication, formatfloat/long/int could have been
6821 shared with stringobject.c, converting from 8-bit to Unicode after the
6822 formatting is done. */
6823
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824static int
6825formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006826 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827 int flags,
6828 int prec,
6829 int type,
6830 PyObject *v)
6831{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006832 /* fmt = '%#.' + `prec` + `type`
6833 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834 char fmt[20];
6835 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006836
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837 x = PyFloat_AsDouble(v);
6838 if (x == -1.0 && PyErr_Occurred())
6839 return -1;
6840 if (prec < 0)
6841 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006842 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6843 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006844 /* Worst case length calc to ensure no buffer overrun:
6845
6846 'g' formats:
6847 fmt = %#.<prec>g
6848 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6849 for any double rep.)
6850 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6851
6852 'f' formats:
6853 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6854 len = 1 + 50 + 1 + prec = 52 + prec
6855
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006856 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006857 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006858
6859 */
6860 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6861 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006862 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006863 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006864 return -1;
6865 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006866 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6867 (flags&F_ALT) ? "#" : "",
6868 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006869 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870}
6871
Tim Peters38fd5b62000-09-21 05:43:11 +00006872static PyObject*
6873formatlong(PyObject *val, int flags, int prec, int type)
6874{
6875 char *buf;
6876 int i, len;
6877 PyObject *str; /* temporary string object. */
6878 PyUnicodeObject *result;
6879
6880 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6881 if (!str)
6882 return NULL;
6883 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006884 if (!result) {
6885 Py_DECREF(str);
6886 return NULL;
6887 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006888 for (i = 0; i < len; i++)
6889 result->str[i] = buf[i];
6890 result->str[len] = 0;
6891 Py_DECREF(str);
6892 return (PyObject*)result;
6893}
6894
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895static int
6896formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006897 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898 int flags,
6899 int prec,
6900 int type,
6901 PyObject *v)
6902{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006903 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006904 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6905 * + 1 + 1
6906 * = 24
6907 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006908 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006909 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910 long x;
6911
6912 x = PyInt_AsLong(v);
6913 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006914 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006915 if (x < 0 && type == 'u') {
6916 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006917 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006918 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6919 sign = "-";
6920 else
6921 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006923 prec = 1;
6924
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006925 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6926 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006927 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006928 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006929 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006930 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006931 return -1;
6932 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006933
6934 if ((flags & F_ALT) &&
6935 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006936 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006937 * of issues that cause pain:
6938 * - when 0 is being converted, the C standard leaves off
6939 * the '0x' or '0X', which is inconsistent with other
6940 * %#x/%#X conversions and inconsistent with Python's
6941 * hex() function
6942 * - there are platforms that violate the standard and
6943 * convert 0 with the '0x' or '0X'
6944 * (Metrowerks, Compaq Tru64)
6945 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006946 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006947 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006948 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006949 * We can achieve the desired consistency by inserting our
6950 * own '0x' or '0X' prefix, and substituting %x/%X in place
6951 * of %#x/%#X.
6952 *
6953 * Note that this is the same approach as used in
6954 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006955 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006956 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6957 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006958 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006959 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006960 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6961 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006962 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006963 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006964 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00006965 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006966 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00006967 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968}
6969
6970static int
6971formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006972 size_t buflen,
6973 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006974{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006975 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006976 if (PyUnicode_Check(v)) {
6977 if (PyUnicode_GET_SIZE(v) != 1)
6978 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006980 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006982 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006983 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006984 goto onError;
6985 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6986 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987
6988 else {
6989 /* Integer input truncated to a character */
6990 long x;
6991 x = PyInt_AsLong(v);
6992 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006993 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006994#ifdef Py_UNICODE_WIDE
6995 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006996 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006997 "%c arg not in range(0x110000) "
6998 "(wide Python build)");
6999 return -1;
7000 }
7001#else
7002 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007003 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007004 "%c arg not in range(0x10000) "
7005 "(narrow Python build)");
7006 return -1;
7007 }
7008#endif
7009 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010 }
7011 buf[1] = '\0';
7012 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007013
7014 onError:
7015 PyErr_SetString(PyExc_TypeError,
7016 "%c requires int or char");
7017 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018}
7019
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand in any expression it is used in.
*/
#define FORMATBUFLEN ((size_t)120)
7029
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030PyObject *PyUnicode_Format(PyObject *format,
7031 PyObject *args)
7032{
7033 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007034 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035 int args_owned = 0;
7036 PyUnicodeObject *result = NULL;
7037 PyObject *dict = NULL;
7038 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007039
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040 if (format == NULL || args == NULL) {
7041 PyErr_BadInternalCall();
7042 return NULL;
7043 }
7044 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007045 if (uformat == NULL)
7046 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007047 fmt = PyUnicode_AS_UNICODE(uformat);
7048 fmtcnt = PyUnicode_GET_SIZE(uformat);
7049
7050 reslen = rescnt = fmtcnt + 100;
7051 result = _PyUnicode_New(reslen);
7052 if (result == NULL)
7053 goto onError;
7054 res = PyUnicode_AS_UNICODE(result);
7055
7056 if (PyTuple_Check(args)) {
7057 arglen = PyTuple_Size(args);
7058 argidx = 0;
7059 }
7060 else {
7061 arglen = -1;
7062 argidx = -2;
7063 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007064 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7065 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007066 dict = args;
7067
7068 while (--fmtcnt >= 0) {
7069 if (*fmt != '%') {
7070 if (--rescnt < 0) {
7071 rescnt = fmtcnt + 100;
7072 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007073 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007074 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007075 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7076 --rescnt;
7077 }
7078 *res++ = *fmt++;
7079 }
7080 else {
7081 /* Got a format specifier */
7082 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007083 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007085 Py_UNICODE c = '\0';
7086 Py_UNICODE fill;
7087 PyObject *v = NULL;
7088 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007089 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007090 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007091 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007092 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007093
7094 fmt++;
7095 if (*fmt == '(') {
7096 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007097 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098 PyObject *key;
7099 int pcount = 1;
7100
7101 if (dict == NULL) {
7102 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007103 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104 goto onError;
7105 }
7106 ++fmt;
7107 --fmtcnt;
7108 keystart = fmt;
7109 /* Skip over balanced parentheses */
7110 while (pcount > 0 && --fmtcnt >= 0) {
7111 if (*fmt == ')')
7112 --pcount;
7113 else if (*fmt == '(')
7114 ++pcount;
7115 fmt++;
7116 }
7117 keylen = fmt - keystart - 1;
7118 if (fmtcnt < 0 || pcount > 0) {
7119 PyErr_SetString(PyExc_ValueError,
7120 "incomplete format key");
7121 goto onError;
7122 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007123#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007124 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007125 then looked up since Python uses strings to hold
7126 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007127 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007128 key = PyUnicode_EncodeUTF8(keystart,
7129 keylen,
7130 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007131#else
7132 key = PyUnicode_FromUnicode(keystart, keylen);
7133#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007134 if (key == NULL)
7135 goto onError;
7136 if (args_owned) {
7137 Py_DECREF(args);
7138 args_owned = 0;
7139 }
7140 args = PyObject_GetItem(dict, key);
7141 Py_DECREF(key);
7142 if (args == NULL) {
7143 goto onError;
7144 }
7145 args_owned = 1;
7146 arglen = -1;
7147 argidx = -2;
7148 }
7149 while (--fmtcnt >= 0) {
7150 switch (c = *fmt++) {
7151 case '-': flags |= F_LJUST; continue;
7152 case '+': flags |= F_SIGN; continue;
7153 case ' ': flags |= F_BLANK; continue;
7154 case '#': flags |= F_ALT; continue;
7155 case '0': flags |= F_ZERO; continue;
7156 }
7157 break;
7158 }
7159 if (c == '*') {
7160 v = getnextarg(args, arglen, &argidx);
7161 if (v == NULL)
7162 goto onError;
7163 if (!PyInt_Check(v)) {
7164 PyErr_SetString(PyExc_TypeError,
7165 "* wants int");
7166 goto onError;
7167 }
7168 width = PyInt_AsLong(v);
7169 if (width < 0) {
7170 flags |= F_LJUST;
7171 width = -width;
7172 }
7173 if (--fmtcnt >= 0)
7174 c = *fmt++;
7175 }
7176 else if (c >= '0' && c <= '9') {
7177 width = c - '0';
7178 while (--fmtcnt >= 0) {
7179 c = *fmt++;
7180 if (c < '0' || c > '9')
7181 break;
7182 if ((width*10) / 10 != width) {
7183 PyErr_SetString(PyExc_ValueError,
7184 "width too big");
7185 goto onError;
7186 }
7187 width = width*10 + (c - '0');
7188 }
7189 }
7190 if (c == '.') {
7191 prec = 0;
7192 if (--fmtcnt >= 0)
7193 c = *fmt++;
7194 if (c == '*') {
7195 v = getnextarg(args, arglen, &argidx);
7196 if (v == NULL)
7197 goto onError;
7198 if (!PyInt_Check(v)) {
7199 PyErr_SetString(PyExc_TypeError,
7200 "* wants int");
7201 goto onError;
7202 }
7203 prec = PyInt_AsLong(v);
7204 if (prec < 0)
7205 prec = 0;
7206 if (--fmtcnt >= 0)
7207 c = *fmt++;
7208 }
7209 else if (c >= '0' && c <= '9') {
7210 prec = c - '0';
7211 while (--fmtcnt >= 0) {
7212 c = Py_CHARMASK(*fmt++);
7213 if (c < '0' || c > '9')
7214 break;
7215 if ((prec*10) / 10 != prec) {
7216 PyErr_SetString(PyExc_ValueError,
7217 "prec too big");
7218 goto onError;
7219 }
7220 prec = prec*10 + (c - '0');
7221 }
7222 }
7223 } /* prec */
7224 if (fmtcnt >= 0) {
7225 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007226 if (--fmtcnt >= 0)
7227 c = *fmt++;
7228 }
7229 }
7230 if (fmtcnt < 0) {
7231 PyErr_SetString(PyExc_ValueError,
7232 "incomplete format");
7233 goto onError;
7234 }
7235 if (c != '%') {
7236 v = getnextarg(args, arglen, &argidx);
7237 if (v == NULL)
7238 goto onError;
7239 }
7240 sign = 0;
7241 fill = ' ';
7242 switch (c) {
7243
7244 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007245 pbuf = formatbuf;
7246 /* presume that buffer length is at least 1 */
7247 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248 len = 1;
7249 break;
7250
7251 case 's':
7252 case 'r':
7253 if (PyUnicode_Check(v) && c == 's') {
7254 temp = v;
7255 Py_INCREF(temp);
7256 }
7257 else {
7258 PyObject *unicode;
7259 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007260 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261 else
7262 temp = PyObject_Repr(v);
7263 if (temp == NULL)
7264 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007265 if (PyUnicode_Check(temp))
7266 /* nothing to do */;
7267 else if (PyString_Check(temp)) {
7268 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007269 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007271 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007272 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007273 Py_DECREF(temp);
7274 temp = unicode;
7275 if (temp == NULL)
7276 goto onError;
7277 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007278 else {
7279 Py_DECREF(temp);
7280 PyErr_SetString(PyExc_TypeError,
7281 "%s argument has non-string str()");
7282 goto onError;
7283 }
7284 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007285 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007286 len = PyUnicode_GET_SIZE(temp);
7287 if (prec >= 0 && len > prec)
7288 len = prec;
7289 break;
7290
7291 case 'i':
7292 case 'd':
7293 case 'u':
7294 case 'o':
7295 case 'x':
7296 case 'X':
7297 if (c == 'i')
7298 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007299 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007300 temp = formatlong(v, flags, prec, c);
7301 if (!temp)
7302 goto onError;
7303 pbuf = PyUnicode_AS_UNICODE(temp);
7304 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007305 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007307 else {
7308 pbuf = formatbuf;
7309 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7310 flags, prec, c, v);
7311 if (len < 0)
7312 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007313 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007314 }
7315 if (flags & F_ZERO)
7316 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317 break;
7318
7319 case 'e':
7320 case 'E':
7321 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007322 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007323 case 'g':
7324 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007325 if (c == 'F')
7326 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007327 pbuf = formatbuf;
7328 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7329 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330 if (len < 0)
7331 goto onError;
7332 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007333 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334 fill = '0';
7335 break;
7336
7337 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007338 pbuf = formatbuf;
7339 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340 if (len < 0)
7341 goto onError;
7342 break;
7343
7344 default:
7345 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007346 "unsupported format character '%c' (0x%x) "
7347 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00007348 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007349 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00007350 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007351 goto onError;
7352 }
7353 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007354 if (*pbuf == '-' || *pbuf == '+') {
7355 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356 len--;
7357 }
7358 else if (flags & F_SIGN)
7359 sign = '+';
7360 else if (flags & F_BLANK)
7361 sign = ' ';
7362 else
7363 sign = 0;
7364 }
7365 if (width < len)
7366 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007367 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007368 reslen -= rescnt;
7369 rescnt = width + fmtcnt + 100;
7370 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007371 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007372 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007373 PyErr_NoMemory();
7374 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007375 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007376 if (_PyUnicode_Resize(&result, reslen) < 0) {
7377 Py_XDECREF(temp);
7378 goto onError;
7379 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007380 res = PyUnicode_AS_UNICODE(result)
7381 + reslen - rescnt;
7382 }
7383 if (sign) {
7384 if (fill != ' ')
7385 *res++ = sign;
7386 rescnt--;
7387 if (width > len)
7388 width--;
7389 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007390 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7391 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007392 assert(pbuf[1] == c);
7393 if (fill != ' ') {
7394 *res++ = *pbuf++;
7395 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007396 }
Tim Petersfff53252001-04-12 18:38:48 +00007397 rescnt -= 2;
7398 width -= 2;
7399 if (width < 0)
7400 width = 0;
7401 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007402 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007403 if (width > len && !(flags & F_LJUST)) {
7404 do {
7405 --rescnt;
7406 *res++ = fill;
7407 } while (--width > len);
7408 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007409 if (fill == ' ') {
7410 if (sign)
7411 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007412 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007413 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007414 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007415 *res++ = *pbuf++;
7416 *res++ = *pbuf++;
7417 }
7418 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007419 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420 res += len;
7421 rescnt -= len;
7422 while (--width >= len) {
7423 --rescnt;
7424 *res++ = ' ';
7425 }
7426 if (dict && (argidx < arglen) && c != '%') {
7427 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007428 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007429 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007430 goto onError;
7431 }
7432 Py_XDECREF(temp);
7433 } /* '%' */
7434 } /* until end */
7435 if (argidx < arglen && !dict) {
7436 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007437 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438 goto onError;
7439 }
7440
Thomas Woutersa96affe2006-03-12 00:29:36 +00007441 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7442 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443 if (args_owned) {
7444 Py_DECREF(args);
7445 }
7446 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007447 return (PyObject *)result;
7448
7449 onError:
7450 Py_XDECREF(result);
7451 Py_DECREF(uformat);
7452 if (args_owned) {
7453 Py_DECREF(args);
7454 }
7455 return NULL;
7456}
7457
7458static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007459 (readbufferproc) unicode_buffer_getreadbuf,
7460 (writebufferproc) unicode_buffer_getwritebuf,
7461 (segcountproc) unicode_buffer_getsegcount,
7462 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463};
7464
Jeremy Hylton938ace62002-07-17 16:30:39 +00007465static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007466unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7467
Tim Peters6d6c1a32001-08-02 04:15:00 +00007468static PyObject *
7469unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7470{
7471 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007472 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007473 char *encoding = NULL;
7474 char *errors = NULL;
7475
Guido van Rossume023fe02001-08-30 03:12:59 +00007476 if (type != &PyUnicode_Type)
7477 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007478 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7479 kwlist, &x, &encoding, &errors))
7480 return NULL;
7481 if (x == NULL)
7482 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007483 if (encoding == NULL && errors == NULL)
7484 return PyObject_Unicode(x);
7485 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007486 return PyUnicode_FromEncodedObject(x, encoding, errors);
7487}
7488
Guido van Rossume023fe02001-08-30 03:12:59 +00007489static PyObject *
7490unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7491{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007492 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007493 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007494
7495 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7496 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7497 if (tmp == NULL)
7498 return NULL;
7499 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007500 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007501 if (pnew == NULL) {
7502 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007503 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007504 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007505 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7506 if (pnew->str == NULL) {
7507 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007508 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007509 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007510 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007511 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007512 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7513 pnew->length = n;
7514 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007515 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007516 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007517}
7518
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007519PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007520"unicode(string [, encoding[, errors]]) -> object\n\
7521\n\
7522Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007523encoding defaults to the current default string encoding.\n\
7524errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007525
Guido van Rossumd57fd912000-03-10 22:53:23 +00007526PyTypeObject PyUnicode_Type = {
7527 PyObject_HEAD_INIT(&PyType_Type)
7528 0, /* ob_size */
7529 "unicode", /* tp_name */
7530 sizeof(PyUnicodeObject), /* tp_size */
7531 0, /* tp_itemsize */
7532 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007533 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007534 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007535 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536 0, /* tp_setattr */
7537 (cmpfunc) unicode_compare, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00007538 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007539 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007540 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007541 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007542 (hashfunc) unicode_hash, /* tp_hash*/
7543 0, /* tp_call*/
7544 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007545 PyObject_GenericGetAttr, /* tp_getattro */
7546 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007547 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007548 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7549 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007550 unicode_doc, /* tp_doc */
7551 0, /* tp_traverse */
7552 0, /* tp_clear */
7553 0, /* tp_richcompare */
7554 0, /* tp_weaklistoffset */
7555 0, /* tp_iter */
7556 0, /* tp_iternext */
7557 unicode_methods, /* tp_methods */
7558 0, /* tp_members */
7559 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007560 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007561 0, /* tp_dict */
7562 0, /* tp_descr_get */
7563 0, /* tp_descr_set */
7564 0, /* tp_dictoffset */
7565 0, /* tp_init */
7566 0, /* tp_alloc */
7567 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007568 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569};
7570
7571/* Initialize the Unicode implementation */
7572
Thomas Wouters78890102000-07-22 19:25:51 +00007573void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007575 int i;
7576
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007577 /* XXX - move this array to unicodectype.c ? */
7578 Py_UNICODE linebreak[] = {
7579 0x000A, /* LINE FEED */
7580 0x000D, /* CARRIAGE RETURN */
7581 0x001C, /* FILE SEPARATOR */
7582 0x001D, /* GROUP SEPARATOR */
7583 0x001E, /* RECORD SEPARATOR */
7584 0x0085, /* NEXT LINE */
7585 0x2028, /* LINE SEPARATOR */
7586 0x2029, /* PARAGRAPH SEPARATOR */
7587 };
7588
Fred Drakee4315f52000-05-09 19:53:39 +00007589 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007590 unicode_freelist = NULL;
7591 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007593 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007594 for (i = 0; i < 256; i++)
7595 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007596 if (PyType_Ready(&PyUnicode_Type) < 0)
7597 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007598
7599 /* initialize the linebreak bloom filter */
7600 bloom_linebreak = make_bloom_mask(
7601 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
7602 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007603}
7604
7605/* Finalize the Unicode implementation */
7606
7607void
Thomas Wouters78890102000-07-22 19:25:51 +00007608_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007610 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007611 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007613 Py_XDECREF(unicode_empty);
7614 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007615
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007616 for (i = 0; i < 256; i++) {
7617 if (unicode_latin1[i]) {
7618 Py_DECREF(unicode_latin1[i]);
7619 unicode_latin1[i] = NULL;
7620 }
7621 }
7622
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007623 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007624 PyUnicodeObject *v = u;
7625 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007626 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007627 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007628 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007629 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007630 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007631 unicode_freelist = NULL;
7632 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007633}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007634
Anthony Baxterac6bd462006-04-13 02:06:09 +00007635#ifdef __cplusplus
7636}
7637#endif
7638
7639
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007640/*
7641Local variables:
7642c-basic-offset: 4
7643indent-tabs-mode: nil
7644End:
7645*/