blob: 82914008e0ac752bd49df78112f1cea4cc019689 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Fredrik Lundhb63588c2006-05-23 18:44:25 +000052#undef USE_INLINE /* XXX - set via configure? */
53
54#if defined(_MSC_VER) /* this is taken from _sre.c */
55#pragma warning(disable: 4710)
56/* fastest possible local call under MSVC */
57#define LOCAL(type) static __inline type __fastcall
58#elif defined(USE_INLINE)
59#define LOCAL(type) static inline type
60#else
61#define LOCAL(type) static type
62#endif
63
Guido van Rossumd57fd912000-03-10 22:53:23 +000064/* Limit for the Unicode object free list */
65
66#define MAX_UNICODE_FREELIST_SIZE 1024
67
68/* Limit for the Unicode object free list stay alive optimization.
69
70 The implementation will keep allocated Unicode memory intact for
71 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000072 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000073
Barry Warsaw51ac5802000-03-20 16:36:48 +000074 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000075 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000076 malloc()-overhead) bytes of unused garbage.
77
78 Setting the limit to 0 effectively turns the feature off.
79
Guido van Rossumfd4b9572000-04-10 13:51:10 +000080 Note: This is an experimental feature ! If you get core dumps when
81 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000082
83*/
84
Guido van Rossumfd4b9572000-04-10 13:51:10 +000085#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
87/* Endianness switches; defaults to little endian */
88
89#ifdef WORDS_BIGENDIAN
90# define BYTEORDER_IS_BIG_ENDIAN
91#else
92# define BYTEORDER_IS_LITTLE_ENDIAN
93#endif
94
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000095/* --- Globals ------------------------------------------------------------
96
97 The globals are initialized by the _PyUnicode_Init() API and should
98 not be used before calling that API.
99
100*/
Guido van Rossumd57fd912000-03-10 22:53:23 +0000101
Anthony Baxterac6bd462006-04-13 02:06:09 +0000102
103#ifdef __cplusplus
104extern "C" {
105#endif
106
Guido van Rossumd57fd912000-03-10 22:53:23 +0000107/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000108static PyUnicodeObject *unicode_freelist;
109static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000110
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000111/* The empty Unicode object is shared to improve performance. */
112static PyUnicodeObject *unicode_empty;
113
114/* Single character Unicode strings in the Latin-1 range are being
115 shared as well. */
116static PyUnicodeObject *unicode_latin1[256];
117
Fred Drakee4315f52000-05-09 19:53:39 +0000118/* Default encoding to use and assume when NULL is passed as encoding
119 parameter; it is initialized by _PyUnicode_Init().
120
121 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000122 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000123
124*/
Fred Drakee4315f52000-05-09 19:53:39 +0000125static char unicode_default_encoding[100];
126
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000127Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000128PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000129{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000130#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000131 return 0x10FFFF;
132#else
133 /* This is actually an illegal character, so it should
134 not be passed to unichr. */
135 return 0xFFFF;
136#endif
137}
138
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000139/* --- Bloom Filters ----------------------------------------------------- */
140
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, with the 5 least
   significant bits of each Unicode character as the bit index. */

/* the linebreak mask is set up by _PyUnicode_Init() below */
146
/* Integer type holding the single-word bit set used as a bloom filter. */
#define BLOOM_MASK unsigned long

/* Filter mask consulted by BLOOM_LINEBREAK; it is zero until the
   initialisation code fills it in. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is an unsigned long: shifting a plain int 1 by 31
   is undefined, and the resulting negative value would sign-extend into
   the upper bits of a 64-bit mask. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap pre-test with the bloom mask, followed by the exact check. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
155
156LOCAL(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
157{
158 /* calculate simple bloom-style bitmask for a given unicode string */
159
160 long mask;
161 Py_ssize_t i;
162
163 mask = 0;
164 for (i = 0; i < len; i++)
165 mask |= (1 << (ptr[i] & 0x1F));
166
167 return mask;
168}
169
170LOCAL(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
171{
172 Py_ssize_t i;
173
174 for (i = 0; i < setlen; i++)
175 if (set[i] == chr)
176 return 1;
177
Fredrik Lundh77633512006-05-23 19:47:35 +0000178 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000179}
180
/* Test whether chr is in set: cheap bloom pre-test first, exact scan only
   if the filter bit is set.  The whole expansion is parenthesised so the
   macro can be negated or combined with other operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
183
Guido van Rossumd57fd912000-03-10 22:53:23 +0000184/* --- Unicode Object ----------------------------------------------------- */
185
186static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000188 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189{
190 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000191
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000192 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000193 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000194 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000195
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196 /* Resizing shared object (unicode_empty or single character
197 objects) in-place is not allowed. Use PyUnicode_Resize()
198 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000199
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000200 if (unicode == unicode_empty ||
201 (unicode->length == 1 &&
202 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000203 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000204 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000206 return -1;
207 }
208
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000209 /* We allocate one more byte to make sure the string is Ux0000 terminated.
210 The overallocation is also used by fastsearch, which assumes that it's
211 safe to look at str[length] (without makeing any assumptions about what
212 it contains). */
213
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 oldstr = unicode->str;
215 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
216 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000217 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 PyErr_NoMemory();
219 return -1;
220 }
221 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000222 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000223
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000224 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000226 if (unicode->defenc) {
227 Py_DECREF(unicode->defenc);
228 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229 }
230 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000231
Guido van Rossumd57fd912000-03-10 22:53:23 +0000232 return 0;
233}
234
/* We allocate one extra Py_UNICODE element to make sure the string is
   Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
242
243static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000244PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000245{
246 register PyUnicodeObject *unicode;
247
Tim Petersced69f82003-09-16 20:30:58 +0000248 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000249 if (length == 0 && unicode_empty != NULL) {
250 Py_INCREF(unicode_empty);
251 return unicode_empty;
252 }
253
254 /* Unicode freelist & memory allocation */
255 if (unicode_freelist) {
256 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000257 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000260 /* Keep-Alive optimization: we only upsize the buffer,
261 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000262 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000263 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000264 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000265 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 }
267 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000268 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000270 }
271 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 }
273 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000274 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (unicode == NULL)
276 return NULL;
277 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
278 }
279
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000280 if (!unicode->str) {
281 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000282 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000283 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000284 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000285 * the caller fails before initializing str -- unicode_resize()
286 * reads str[0], and the Keep-Alive optimization can keep memory
287 * allocated for str alive across a call to unicode_dealloc(unicode).
288 * We don't want unicode_resize to read uninitialized memory in
289 * that case.
290 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000291 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000293 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000295 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000297
298 onError:
299 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000300 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000301 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302}
303
304static
Guido van Rossum9475a232001-10-05 20:51:39 +0000305void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000307 if (PyUnicode_CheckExact(unicode) &&
308 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000309 /* Keep-Alive optimization */
310 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000311 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 unicode->str = NULL;
313 unicode->length = 0;
314 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000315 if (unicode->defenc) {
316 Py_DECREF(unicode->defenc);
317 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000318 }
319 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320 *(PyUnicodeObject **)unicode = unicode_freelist;
321 unicode_freelist = unicode;
322 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 }
324 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000325 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000326 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000327 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000328 }
329}
330
Martin v. Löwis18e16552006-02-15 17:27:45 +0000331int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000332{
333 register PyUnicodeObject *v;
334
335 /* Argument checks */
336 if (unicode == NULL) {
337 PyErr_BadInternalCall();
338 return -1;
339 }
340 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000341 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000342 PyErr_BadInternalCall();
343 return -1;
344 }
345
346 /* Resizing unicode_empty and single character objects is not
347 possible since these are being shared. We simply return a fresh
348 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000349 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000350 (v == unicode_empty || v->length == 1)) {
351 PyUnicodeObject *w = _PyUnicode_New(length);
352 if (w == NULL)
353 return -1;
354 Py_UNICODE_COPY(w->str, v->str,
355 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000356 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000357 *unicode = (PyObject *)w;
358 return 0;
359 }
360
361 /* Note that we don't have to modify *unicode for unshared Unicode
362 objects, since we can modify them in-place. */
363 return unicode_resize(v, length);
364}
365
/* Internal API for use in unicodeobject.c only !
   Convenience wrapper around PyUnicode_Resize() that casts the address of
   a PyUnicodeObject* variable to PyObject**.  Both arguments are
   parenthesised so arbitrary expressions can be passed. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
369
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000371 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000372{
373 PyUnicodeObject *unicode;
374
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000375 /* If the Unicode data is known at construction time, we can apply
376 some optimizations which share commonly used objects. */
377 if (u != NULL) {
378
379 /* Optimization for empty strings */
380 if (size == 0 && unicode_empty != NULL) {
381 Py_INCREF(unicode_empty);
382 return (PyObject *)unicode_empty;
383 }
384
385 /* Single character Unicode objects in the Latin-1 range are
386 shared when using this constructor */
387 if (size == 1 && *u < 256) {
388 unicode = unicode_latin1[*u];
389 if (!unicode) {
390 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000391 if (!unicode)
392 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000393 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000394 unicode_latin1[*u] = unicode;
395 }
396 Py_INCREF(unicode);
397 return (PyObject *)unicode;
398 }
399 }
Tim Petersced69f82003-09-16 20:30:58 +0000400
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401 unicode = _PyUnicode_New(size);
402 if (!unicode)
403 return NULL;
404
405 /* Copy the Unicode data into the new object */
406 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000407 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000408
409 return (PyObject *)unicode;
410}
411
412#ifdef HAVE_WCHAR_H
413
414PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000415 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000416{
417 PyUnicodeObject *unicode;
418
419 if (w == NULL) {
420 PyErr_BadInternalCall();
421 return NULL;
422 }
423
424 unicode = _PyUnicode_New(size);
425 if (!unicode)
426 return NULL;
427
428 /* Copy the wchar_t data into the new object */
429#ifdef HAVE_USABLE_WCHAR_T
430 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000431#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432 {
433 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000434 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000435 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000436 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000437 *u++ = *w++;
438 }
439#endif
440
441 return (PyObject *)unicode;
442}
443
Martin v. Löwis18e16552006-02-15 17:27:45 +0000444Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
445 wchar_t *w,
446 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000447{
448 if (unicode == NULL) {
449 PyErr_BadInternalCall();
450 return -1;
451 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000452
453 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000454 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000455 size = PyUnicode_GET_SIZE(unicode) + 1;
456
Guido van Rossumd57fd912000-03-10 22:53:23 +0000457#ifdef HAVE_USABLE_WCHAR_T
458 memcpy(w, unicode->str, size * sizeof(wchar_t));
459#else
460 {
461 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000462 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000463 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000464 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000465 *w++ = *u++;
466 }
467#endif
468
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000469 if (size > PyUnicode_GET_SIZE(unicode))
470 return PyUnicode_GET_SIZE(unicode);
471 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000472 return size;
473}
474
475#endif
476
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000477PyObject *PyUnicode_FromOrdinal(int ordinal)
478{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000479 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000480
481#ifdef Py_UNICODE_WIDE
482 if (ordinal < 0 || ordinal > 0x10ffff) {
483 PyErr_SetString(PyExc_ValueError,
484 "unichr() arg not in range(0x110000) "
485 "(wide Python build)");
486 return NULL;
487 }
488#else
489 if (ordinal < 0 || ordinal > 0xffff) {
490 PyErr_SetString(PyExc_ValueError,
491 "unichr() arg not in range(0x10000) "
492 "(narrow Python build)");
493 return NULL;
494 }
495#endif
496
Hye-Shik Chang40574832004-04-06 07:24:51 +0000497 s[0] = (Py_UNICODE)ordinal;
498 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000499}
500
Guido van Rossumd57fd912000-03-10 22:53:23 +0000501PyObject *PyUnicode_FromObject(register PyObject *obj)
502{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000503 /* XXX Perhaps we should make this API an alias of
504 PyObject_Unicode() instead ?! */
505 if (PyUnicode_CheckExact(obj)) {
506 Py_INCREF(obj);
507 return obj;
508 }
509 if (PyUnicode_Check(obj)) {
510 /* For a Unicode subtype that's not a Unicode object,
511 return a true Unicode object with the same data. */
512 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
513 PyUnicode_GET_SIZE(obj));
514 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000515 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
516}
517
518PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
519 const char *encoding,
520 const char *errors)
521{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000522 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000523 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000524 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000525
Guido van Rossumd57fd912000-03-10 22:53:23 +0000526 if (obj == NULL) {
527 PyErr_BadInternalCall();
528 return NULL;
529 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000530
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000531#if 0
532 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000533 that no encodings is given and then redirect to
534 PyObject_Unicode() which then applies the additional logic for
535 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000536
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000537 NOTE: This API should really only be used for object which
538 represent *encoded* Unicode !
539
540 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000541 if (PyUnicode_Check(obj)) {
542 if (encoding) {
543 PyErr_SetString(PyExc_TypeError,
544 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000545 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000546 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000547 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000548 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000549#else
550 if (PyUnicode_Check(obj)) {
551 PyErr_SetString(PyExc_TypeError,
552 "decoding Unicode is not supported");
553 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000554 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000555#endif
556
557 /* Coerce object */
558 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000559 s = PyString_AS_STRING(obj);
560 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000561 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000562 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
563 /* Overwrite the error message with something more useful in
564 case of a TypeError. */
565 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000566 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000567 "coercing to Unicode: need string or buffer, "
568 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000569 obj->ob_type->tp_name);
570 goto onError;
571 }
Tim Petersced69f82003-09-16 20:30:58 +0000572
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000573 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000574 if (len == 0) {
575 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000576 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000577 }
Tim Petersced69f82003-09-16 20:30:58 +0000578 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000579 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000580
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000581 return v;
582
583 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000584 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000585}
586
587PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000588 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000589 const char *encoding,
590 const char *errors)
591{
592 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000593
594 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000595 encoding = PyUnicode_GetDefaultEncoding();
596
597 /* Shortcuts for common default encodings */
598 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000599 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000600 else if (strcmp(encoding, "latin-1") == 0)
601 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000602#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
603 else if (strcmp(encoding, "mbcs") == 0)
604 return PyUnicode_DecodeMBCS(s, size, errors);
605#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000606 else if (strcmp(encoding, "ascii") == 0)
607 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000608
609 /* Decode via the codec registry */
610 buffer = PyBuffer_FromMemory((void *)s, size);
611 if (buffer == NULL)
612 goto onError;
613 unicode = PyCodec_Decode(buffer, encoding, errors);
614 if (unicode == NULL)
615 goto onError;
616 if (!PyUnicode_Check(unicode)) {
617 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000618 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000619 unicode->ob_type->tp_name);
620 Py_DECREF(unicode);
621 goto onError;
622 }
623 Py_DECREF(buffer);
624 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000625
Guido van Rossumd57fd912000-03-10 22:53:23 +0000626 onError:
627 Py_XDECREF(buffer);
628 return NULL;
629}
630
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000631PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
632 const char *encoding,
633 const char *errors)
634{
635 PyObject *v;
636
637 if (!PyUnicode_Check(unicode)) {
638 PyErr_BadArgument();
639 goto onError;
640 }
641
642 if (encoding == NULL)
643 encoding = PyUnicode_GetDefaultEncoding();
644
645 /* Decode via the codec registry */
646 v = PyCodec_Decode(unicode, encoding, errors);
647 if (v == NULL)
648 goto onError;
649 return v;
650
651 onError:
652 return NULL;
653}
654
Guido van Rossumd57fd912000-03-10 22:53:23 +0000655PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000656 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000657 const char *encoding,
658 const char *errors)
659{
660 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000661
Guido van Rossumd57fd912000-03-10 22:53:23 +0000662 unicode = PyUnicode_FromUnicode(s, size);
663 if (unicode == NULL)
664 return NULL;
665 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
666 Py_DECREF(unicode);
667 return v;
668}
669
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000670PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
671 const char *encoding,
672 const char *errors)
673{
674 PyObject *v;
675
676 if (!PyUnicode_Check(unicode)) {
677 PyErr_BadArgument();
678 goto onError;
679 }
680
681 if (encoding == NULL)
682 encoding = PyUnicode_GetDefaultEncoding();
683
684 /* Encode via the codec registry */
685 v = PyCodec_Encode(unicode, encoding, errors);
686 if (v == NULL)
687 goto onError;
688 return v;
689
690 onError:
691 return NULL;
692}
693
Guido van Rossumd57fd912000-03-10 22:53:23 +0000694PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
695 const char *encoding,
696 const char *errors)
697{
698 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000699
Guido van Rossumd57fd912000-03-10 22:53:23 +0000700 if (!PyUnicode_Check(unicode)) {
701 PyErr_BadArgument();
702 goto onError;
703 }
Fred Drakee4315f52000-05-09 19:53:39 +0000704
Tim Petersced69f82003-09-16 20:30:58 +0000705 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000706 encoding = PyUnicode_GetDefaultEncoding();
707
708 /* Shortcuts for common default encodings */
709 if (errors == NULL) {
710 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000711 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000712 else if (strcmp(encoding, "latin-1") == 0)
713 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000714#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
715 else if (strcmp(encoding, "mbcs") == 0)
716 return PyUnicode_AsMBCSString(unicode);
717#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000718 else if (strcmp(encoding, "ascii") == 0)
719 return PyUnicode_AsASCIIString(unicode);
720 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000721
722 /* Encode via the codec registry */
723 v = PyCodec_Encode(unicode, encoding, errors);
724 if (v == NULL)
725 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000726 if (!PyString_Check(v)) {
727 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000728 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000729 v->ob_type->tp_name);
730 Py_DECREF(v);
731 goto onError;
732 }
733 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000734
Guido van Rossumd57fd912000-03-10 22:53:23 +0000735 onError:
736 return NULL;
737}
738
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000739PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
740 const char *errors)
741{
742 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
743
744 if (v)
745 return v;
746 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
747 if (v && errors == NULL)
748 ((PyUnicodeObject *)unicode)->defenc = v;
749 return v;
750}
751
Guido van Rossumd57fd912000-03-10 22:53:23 +0000752Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
753{
754 if (!PyUnicode_Check(unicode)) {
755 PyErr_BadArgument();
756 goto onError;
757 }
758 return PyUnicode_AS_UNICODE(unicode);
759
760 onError:
761 return NULL;
762}
763
Martin v. Löwis18e16552006-02-15 17:27:45 +0000764Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000765{
766 if (!PyUnicode_Check(unicode)) {
767 PyErr_BadArgument();
768 goto onError;
769 }
770 return PyUnicode_GET_SIZE(unicode);
771
772 onError:
773 return -1;
774}
775
Thomas Wouters78890102000-07-22 19:25:51 +0000776const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000777{
778 return unicode_default_encoding;
779}
780
781int PyUnicode_SetDefaultEncoding(const char *encoding)
782{
783 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000784
Fred Drakee4315f52000-05-09 19:53:39 +0000785 /* Make sure the encoding is valid. As side effect, this also
786 loads the encoding into the codec registry cache. */
787 v = _PyCodec_Lookup(encoding);
788 if (v == NULL)
789 goto onError;
790 Py_DECREF(v);
791 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000792 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000793 sizeof(unicode_default_encoding));
794 return 0;
795
796 onError:
797 return -1;
798}
799
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000800/* error handling callback helper:
801 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000802 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000803 and adjust various state variables.
804 return 0 on success, -1 on error
805*/
806
807static
808int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
809 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000810 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
811 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000812{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000813 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000814
815 PyObject *restuple = NULL;
816 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000817 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
818 Py_ssize_t requiredsize;
819 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000820 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000821 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000822 int res = -1;
823
824 if (*errorHandler == NULL) {
825 *errorHandler = PyCodec_LookupError(errors);
826 if (*errorHandler == NULL)
827 goto onError;
828 }
829
830 if (*exceptionObject == NULL) {
831 *exceptionObject = PyUnicodeDecodeError_Create(
832 encoding, input, insize, *startinpos, *endinpos, reason);
833 if (*exceptionObject == NULL)
834 goto onError;
835 }
836 else {
837 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
838 goto onError;
839 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
840 goto onError;
841 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
842 goto onError;
843 }
844
845 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
846 if (restuple == NULL)
847 goto onError;
848 if (!PyTuple_Check(restuple)) {
849 PyErr_Format(PyExc_TypeError, &argparse[4]);
850 goto onError;
851 }
852 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
853 goto onError;
854 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000855 newpos = insize+newpos;
856 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000857 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000858 goto onError;
859 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000860
861 /* need more space? (at least enough for what we
862 have+the replacement+the rest of the string (starting
863 at the new input position), so we won't have to check space
864 when there are no errors in the rest of the string) */
865 repptr = PyUnicode_AS_UNICODE(repunicode);
866 repsize = PyUnicode_GET_SIZE(repunicode);
867 requiredsize = *outpos + repsize + insize-newpos;
868 if (requiredsize > outsize) {
869 if (requiredsize<2*outsize)
870 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000871 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000872 goto onError;
873 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
874 }
875 *endinpos = newpos;
876 *inptr = input + newpos;
877 Py_UNICODE_COPY(*outptr, repptr, repsize);
878 *outptr += repsize;
879 *outpos += repsize;
880 /* we made it! */
881 res = 0;
882
883 onError:
884 Py_XDECREF(restuple);
885 return res;
886}
887
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000888/* --- UTF-7 Codec -------------------------------------------------------- */
889
890/* see RFC2152 for details */
891
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

/* Non-zero when c must be base64-encoded: anything outside 1..127,
   table class 1 always, class 2 when encodeWS is set, class 3 when
   encodeO is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Map the low six bits of n to its base64 alphabet character. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* Non-zero when c belongs to the base64 alphabet.  Explicit ASCII
   range checks are used instead of isalnum(): c may be a Py_UNICODE
   value outside the unsigned char range (undefined for <ctype.h>
   functions), and isalnum() may accept extra characters depending on
   the current locale. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Inverse of B64(); only meaningful when B64CHAR(c) holds. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit one base64 character to out for every complete group of six
   pending bits in ch, decreasing bits accordingly. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* Emit one 16-bit unit to out for every complete group of sixteen
   pending bits in ch.  Relies on an `errmsg` variable and a
   `utf7Error` label in the expanding function. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000953
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000954PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000955 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000956 const char *errors)
957{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000958 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000959 Py_ssize_t startinpos;
960 Py_ssize_t endinpos;
961 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000962 const char *e;
963 PyUnicodeObject *unicode;
964 Py_UNICODE *p;
965 const char *errmsg = "";
966 int inShift = 0;
967 unsigned int bitsleft = 0;
968 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000969 int surrogate = 0;
970 PyObject *errorHandler = NULL;
971 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000972
973 unicode = _PyUnicode_New(size);
974 if (!unicode)
975 return NULL;
976 if (size == 0)
977 return (PyObject *)unicode;
978
979 p = unicode->str;
980 e = s + size;
981
982 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000983 Py_UNICODE ch;
984 restart:
985 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000986
987 if (inShift) {
988 if ((ch == '-') || !B64CHAR(ch)) {
989 inShift = 0;
990 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000991
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000992 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
993 if (bitsleft >= 6) {
994 /* The shift sequence has a partial character in it. If
995 bitsleft < 6 then we could just classify it as padding
996 but that is not the case here */
997
998 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000999 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001000 }
1001 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001002 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001003 here so indicate the potential of a misencoded character. */
1004
1005 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1006 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1007 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001008 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001009 }
1010
1011 if (ch == '-') {
1012 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001013 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001014 inShift = 1;
1015 }
1016 } else if (SPECIAL(ch,0,0)) {
1017 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001018 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001019 } else {
1020 *p++ = ch;
1021 }
1022 } else {
1023 charsleft = (charsleft << 6) | UB64(ch);
1024 bitsleft += 6;
1025 s++;
1026 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1027 }
1028 }
1029 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001030 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001031 s++;
1032 if (s < e && *s == '-') {
1033 s++;
1034 *p++ = '+';
1035 } else
1036 {
1037 inShift = 1;
1038 bitsleft = 0;
1039 }
1040 }
1041 else if (SPECIAL(ch,0,0)) {
1042 errmsg = "unexpected special character";
1043 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001044 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001045 }
1046 else {
1047 *p++ = ch;
1048 s++;
1049 }
1050 continue;
1051 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001052 outpos = p-PyUnicode_AS_UNICODE(unicode);
1053 endinpos = s-starts;
1054 if (unicode_decode_call_errorhandler(
1055 errors, &errorHandler,
1056 "utf7", errmsg,
1057 starts, size, &startinpos, &endinpos, &exc, &s,
1058 (PyObject **)&unicode, &outpos, &p))
1059 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001060 }
1061
1062 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001063 outpos = p-PyUnicode_AS_UNICODE(unicode);
1064 endinpos = size;
1065 if (unicode_decode_call_errorhandler(
1066 errors, &errorHandler,
1067 "utf7", "unterminated shift sequence",
1068 starts, size, &startinpos, &endinpos, &exc, &s,
1069 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001070 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001071 if (s < e)
1072 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001073 }
1074
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001075 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001076 goto onError;
1077
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001078 Py_XDECREF(errorHandler);
1079 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001080 return (PyObject *)unicode;
1081
1082onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001083 Py_XDECREF(errorHandler);
1084 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001085 Py_DECREF(unicode);
1086 return NULL;
1087}
1088
1089
1090PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001091 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001092 int encodeSetO,
1093 int encodeWhiteSpace,
1094 const char *errors)
1095{
1096 PyObject *v;
1097 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001098 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001099 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001100 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001101 unsigned int bitsleft = 0;
1102 unsigned long charsleft = 0;
1103 char * out;
1104 char * start;
1105
1106 if (size == 0)
1107 return PyString_FromStringAndSize(NULL, 0);
1108
1109 v = PyString_FromStringAndSize(NULL, cbAllocated);
1110 if (v == NULL)
1111 return NULL;
1112
1113 start = out = PyString_AS_STRING(v);
1114 for (;i < size; ++i) {
1115 Py_UNICODE ch = s[i];
1116
1117 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001118 if (ch == '+') {
1119 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001120 *out++ = '-';
1121 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1122 charsleft = ch;
1123 bitsleft = 16;
1124 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001125 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001126 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001127 } else {
1128 *out++ = (char) ch;
1129 }
1130 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001131 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1132 *out++ = B64(charsleft << (6-bitsleft));
1133 charsleft = 0;
1134 bitsleft = 0;
1135 /* Characters not in the BASE64 set implicitly unshift the sequence
1136 so no '-' is required, except if the character is itself a '-' */
1137 if (B64CHAR(ch) || ch == '-') {
1138 *out++ = '-';
1139 }
1140 inShift = 0;
1141 *out++ = (char) ch;
1142 } else {
1143 bitsleft += 16;
1144 charsleft = (charsleft << 16) | ch;
1145 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1146
1147 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001148 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001149 or '-' then the shift sequence will be terminated implicitly and we
1150 don't have to insert a '-'. */
1151
1152 if (bitsleft == 0) {
1153 if (i + 1 < size) {
1154 Py_UNICODE ch2 = s[i+1];
1155
1156 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001157
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001158 } else if (B64CHAR(ch2) || ch2 == '-') {
1159 *out++ = '-';
1160 inShift = 0;
1161 } else {
1162 inShift = 0;
1163 }
1164
1165 }
1166 else {
1167 *out++ = '-';
1168 inShift = 0;
1169 }
1170 }
Tim Petersced69f82003-09-16 20:30:58 +00001171 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001172 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001173 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001174 if (bitsleft) {
1175 *out++= B64(charsleft << (6-bitsleft) );
1176 *out++ = '-';
1177 }
1178
Tim Peters5de98422002-04-27 18:44:32 +00001179 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001180 return v;
1181}
1182
1183#undef SPECIAL
1184#undef B64
1185#undef B64CHAR
1186#undef UB64
1187#undef ENCODE
1188#undef DECODE
1189
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190/* --- UTF-8 Codec -------------------------------------------------------- */
1191
/* Sequence length implied by each possible UTF-8 lead byte; 0 marks a
   byte that cannot start a sequence.  See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00-0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    /* 0x80-0xBF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0xC0-0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    /* 0xE0-0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    /* 0xF0-0xF7: four, 0xF8-0xFB: five, 0xFC-0xFD: six,
       0xFE-0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1213
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001215 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001216 const char *errors)
1217{
Walter Dörwald69652032004-09-07 20:24:22 +00001218 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1219}
1220
1221PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001222 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001223 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001224 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001225{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001226 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001227 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001228 Py_ssize_t startinpos;
1229 Py_ssize_t endinpos;
1230 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001231 const char *e;
1232 PyUnicodeObject *unicode;
1233 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001234 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001235 PyObject *errorHandler = NULL;
1236 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001237
1238 /* Note: size will always be longer than the resulting Unicode
1239 character count */
1240 unicode = _PyUnicode_New(size);
1241 if (!unicode)
1242 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001243 if (size == 0) {
1244 if (consumed)
1245 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001246 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001247 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248
1249 /* Unpack UTF-8 encoded data */
1250 p = unicode->str;
1251 e = s + size;
1252
1253 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001254 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001255
1256 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001257 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001258 s++;
1259 continue;
1260 }
1261
1262 n = utf8_code_length[ch];
1263
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001264 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001265 if (consumed)
1266 break;
1267 else {
1268 errmsg = "unexpected end of data";
1269 startinpos = s-starts;
1270 endinpos = size;
1271 goto utf8Error;
1272 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001273 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001274
1275 switch (n) {
1276
1277 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001278 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001279 startinpos = s-starts;
1280 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001281 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001282
1283 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001284 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001285 startinpos = s-starts;
1286 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001287 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001288
1289 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001290 if ((s[1] & 0xc0) != 0x80) {
1291 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001292 startinpos = s-starts;
1293 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001294 goto utf8Error;
1295 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001296 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001297 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001298 startinpos = s-starts;
1299 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001300 errmsg = "illegal encoding";
1301 goto utf8Error;
1302 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001303 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001304 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001305 break;
1306
1307 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001308 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001309 (s[2] & 0xc0) != 0x80) {
1310 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001311 startinpos = s-starts;
1312 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001313 goto utf8Error;
1314 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001315 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001316 if (ch < 0x0800) {
1317 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001318 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001319
1320 XXX For wide builds (UCS-4) we should probably try
1321 to recombine the surrogates into a single code
1322 unit.
1323 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001324 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001325 startinpos = s-starts;
1326 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001327 goto utf8Error;
1328 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001329 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001330 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001331 break;
1332
1333 case 4:
1334 if ((s[1] & 0xc0) != 0x80 ||
1335 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001336 (s[3] & 0xc0) != 0x80) {
1337 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001338 startinpos = s-starts;
1339 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001340 goto utf8Error;
1341 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001342 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1343 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1344 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001345 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001346 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001347 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001348 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001349 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001350 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001351 startinpos = s-starts;
1352 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001353 goto utf8Error;
1354 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001355#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001356 *p++ = (Py_UNICODE)ch;
1357#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001358 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001359
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001360 /* translate from 10000..10FFFF to 0..FFFF */
1361 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001362
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001363 /* high surrogate = top 10 bits added to D800 */
1364 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001365
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001366 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001367 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001368#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001369 break;
1370
1371 default:
1372 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001373 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001374 startinpos = s-starts;
1375 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001376 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001377 }
1378 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001379 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001380
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001381 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001382 outpos = p-PyUnicode_AS_UNICODE(unicode);
1383 if (unicode_decode_call_errorhandler(
1384 errors, &errorHandler,
1385 "utf8", errmsg,
1386 starts, size, &startinpos, &endinpos, &exc, &s,
1387 (PyObject **)&unicode, &outpos, &p))
1388 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001389 }
Walter Dörwald69652032004-09-07 20:24:22 +00001390 if (consumed)
1391 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392
1393 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001394 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001395 goto onError;
1396
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001397 Py_XDECREF(errorHandler);
1398 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001399 return (PyObject *)unicode;
1400
1401onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001402 Py_XDECREF(errorHandler);
1403 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001404 Py_DECREF(unicode);
1405 return NULL;
1406}
1407
Tim Peters602f7402002-04-27 18:03:26 +00001408/* Allocation strategy: if the string is short, convert into a stack buffer
1409 and allocate exactly as much space needed at the end. Else allocate the
1410 maximum possible needed (4 result bytes per Unicode character), and return
1411 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001412*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001413PyObject *
1414PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001415 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001416 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001417{
Tim Peters602f7402002-04-27 18:03:26 +00001418#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001419
Martin v. Löwis18e16552006-02-15 17:27:45 +00001420 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001421 PyObject *v; /* result string object */
1422 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001423 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001424 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001425 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001426
Tim Peters602f7402002-04-27 18:03:26 +00001427 assert(s != NULL);
1428 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001429
Tim Peters602f7402002-04-27 18:03:26 +00001430 if (size <= MAX_SHORT_UNICHARS) {
1431 /* Write into the stack buffer; nallocated can't overflow.
1432 * At the end, we'll allocate exactly as much heap space as it
1433 * turns out we need.
1434 */
1435 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1436 v = NULL; /* will allocate after we're done */
1437 p = stackbuf;
1438 }
1439 else {
1440 /* Overallocate on the heap, and give the excess back at the end. */
1441 nallocated = size * 4;
1442 if (nallocated / 4 != size) /* overflow! */
1443 return PyErr_NoMemory();
1444 v = PyString_FromStringAndSize(NULL, nallocated);
1445 if (v == NULL)
1446 return NULL;
1447 p = PyString_AS_STRING(v);
1448 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001449
Tim Peters602f7402002-04-27 18:03:26 +00001450 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001451 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001452
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001453 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001454 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001455 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001456
Guido van Rossumd57fd912000-03-10 22:53:23 +00001457 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001458 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001459 *p++ = (char)(0xc0 | (ch >> 6));
1460 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001461 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001462 else {
Tim Peters602f7402002-04-27 18:03:26 +00001463 /* Encode UCS2 Unicode ordinals */
1464 if (ch < 0x10000) {
1465 /* Special case: check for high surrogate */
1466 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1467 Py_UCS4 ch2 = s[i];
1468 /* Check for low surrogate and combine the two to
1469 form a UCS4 value */
1470 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001471 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001472 i++;
1473 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001474 }
Tim Peters602f7402002-04-27 18:03:26 +00001475 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001476 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001477 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001478 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1479 *p++ = (char)(0x80 | (ch & 0x3f));
1480 continue;
1481 }
1482encodeUCS4:
1483 /* Encode UCS4 Unicode ordinals */
1484 *p++ = (char)(0xf0 | (ch >> 18));
1485 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1486 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1487 *p++ = (char)(0x80 | (ch & 0x3f));
1488 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001489 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001490
Tim Peters602f7402002-04-27 18:03:26 +00001491 if (v == NULL) {
1492 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001493 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001494 assert(nneeded <= nallocated);
1495 v = PyString_FromStringAndSize(stackbuf, nneeded);
1496 }
1497 else {
1498 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001499 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001500 assert(nneeded <= nallocated);
1501 _PyString_Resize(&v, nneeded);
1502 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001503 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001504
Tim Peters602f7402002-04-27 18:03:26 +00001505#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001506}
1507
Guido van Rossumd57fd912000-03-10 22:53:23 +00001508PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1509{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001510 if (!PyUnicode_Check(unicode)) {
1511 PyErr_BadArgument();
1512 return NULL;
1513 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001514 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1515 PyUnicode_GET_SIZE(unicode),
1516 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001517}
1518
1519/* --- UTF-16 Codec ------------------------------------------------------- */
1520
Tim Peters772747b2001-08-09 22:21:55 +00001521PyObject *
1522PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001523 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001524 const char *errors,
1525 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526{
Walter Dörwald69652032004-09-07 20:24:22 +00001527 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1528}
1529
1530PyObject *
1531PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001532 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001533 const char *errors,
1534 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001535 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001536{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001537 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001538 Py_ssize_t startinpos;
1539 Py_ssize_t endinpos;
1540 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001541 PyUnicodeObject *unicode;
1542 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001543 const unsigned char *q, *e;
1544 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001545 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001546 /* Offsets from q for retrieving byte pairs in the right order. */
1547#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1548 int ihi = 1, ilo = 0;
1549#else
1550 int ihi = 0, ilo = 1;
1551#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001552 PyObject *errorHandler = NULL;
1553 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001554
1555 /* Note: size will always be longer than the resulting Unicode
1556 character count */
1557 unicode = _PyUnicode_New(size);
1558 if (!unicode)
1559 return NULL;
1560 if (size == 0)
1561 return (PyObject *)unicode;
1562
1563 /* Unpack UTF-16 encoded data */
1564 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001565 q = (unsigned char *)s;
1566 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001567
1568 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001569 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001570
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001571 /* Check for BOM marks (U+FEFF) in the input and adjust current
1572 byte order setting accordingly. In native mode, the leading BOM
1573 mark is skipped, in all other modes, it is copied to the output
1574 stream as-is (giving a ZWNBSP character). */
1575 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001576 if (size >= 2) {
1577 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001578#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001579 if (bom == 0xFEFF) {
1580 q += 2;
1581 bo = -1;
1582 }
1583 else if (bom == 0xFFFE) {
1584 q += 2;
1585 bo = 1;
1586 }
Tim Petersced69f82003-09-16 20:30:58 +00001587#else
Walter Dörwald69652032004-09-07 20:24:22 +00001588 if (bom == 0xFEFF) {
1589 q += 2;
1590 bo = 1;
1591 }
1592 else if (bom == 0xFFFE) {
1593 q += 2;
1594 bo = -1;
1595 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001596#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001597 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001598 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001599
Tim Peters772747b2001-08-09 22:21:55 +00001600 if (bo == -1) {
1601 /* force LE */
1602 ihi = 1;
1603 ilo = 0;
1604 }
1605 else if (bo == 1) {
1606 /* force BE */
1607 ihi = 0;
1608 ilo = 1;
1609 }
1610
1611 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001612 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001613 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001614 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001615 if (consumed)
1616 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001617 errmsg = "truncated data";
1618 startinpos = ((const char *)q)-starts;
1619 endinpos = ((const char *)e)-starts;
1620 goto utf16Error;
1621 /* The remaining input chars are ignored if the callback
1622 chooses to skip the input */
1623 }
1624 ch = (q[ihi] << 8) | q[ilo];
1625
Tim Peters772747b2001-08-09 22:21:55 +00001626 q += 2;
1627
Guido van Rossumd57fd912000-03-10 22:53:23 +00001628 if (ch < 0xD800 || ch > 0xDFFF) {
1629 *p++ = ch;
1630 continue;
1631 }
1632
1633 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001634 if (q >= e) {
1635 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001636 startinpos = (((const char *)q)-2)-starts;
1637 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001638 goto utf16Error;
1639 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001640 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001641 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1642 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001643 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001644#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001645 *p++ = ch;
1646 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001647#else
1648 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001649#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001650 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001651 }
1652 else {
1653 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001654 startinpos = (((const char *)q)-4)-starts;
1655 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001656 goto utf16Error;
1657 }
1658
Guido van Rossumd57fd912000-03-10 22:53:23 +00001659 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001660 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001661 startinpos = (((const char *)q)-2)-starts;
1662 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001663 /* Fall through to report the error */
1664
1665 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001666 outpos = p-PyUnicode_AS_UNICODE(unicode);
1667 if (unicode_decode_call_errorhandler(
1668 errors, &errorHandler,
1669 "utf16", errmsg,
1670 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1671 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001672 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001673 }
1674
1675 if (byteorder)
1676 *byteorder = bo;
1677
Walter Dörwald69652032004-09-07 20:24:22 +00001678 if (consumed)
1679 *consumed = (const char *)q-starts;
1680
Guido van Rossumd57fd912000-03-10 22:53:23 +00001681 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001682 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001683 goto onError;
1684
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001685 Py_XDECREF(errorHandler);
1686 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001687 return (PyObject *)unicode;
1688
1689onError:
1690 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001691 Py_XDECREF(errorHandler);
1692 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001693 return NULL;
1694}
1695
Tim Peters772747b2001-08-09 22:21:55 +00001696PyObject *
1697PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001698 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001699 const char *errors,
1700 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001701{
1702 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001703 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001704#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001705 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001706#else
1707 const int pairs = 0;
1708#endif
Tim Peters772747b2001-08-09 22:21:55 +00001709 /* Offsets from p for storing byte pairs in the right order. */
1710#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1711 int ihi = 1, ilo = 0;
1712#else
1713 int ihi = 0, ilo = 1;
1714#endif
1715
1716#define STORECHAR(CH) \
1717 do { \
1718 p[ihi] = ((CH) >> 8) & 0xff; \
1719 p[ilo] = (CH) & 0xff; \
1720 p += 2; \
1721 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001722
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001723#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001724 for (i = pairs = 0; i < size; i++)
1725 if (s[i] >= 0x10000)
1726 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001727#endif
Tim Petersced69f82003-09-16 20:30:58 +00001728 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001729 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001730 if (v == NULL)
1731 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001732
Tim Peters772747b2001-08-09 22:21:55 +00001733 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001735 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001736 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001737 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001738
1739 if (byteorder == -1) {
1740 /* force LE */
1741 ihi = 1;
1742 ilo = 0;
1743 }
1744 else if (byteorder == 1) {
1745 /* force BE */
1746 ihi = 0;
1747 ilo = 1;
1748 }
1749
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001750 while (size-- > 0) {
1751 Py_UNICODE ch = *s++;
1752 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001753#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001754 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001755 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1756 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001757 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001758#endif
Tim Peters772747b2001-08-09 22:21:55 +00001759 STORECHAR(ch);
1760 if (ch2)
1761 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001762 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001763 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001764#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001765}
1766
1767PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1768{
1769 if (!PyUnicode_Check(unicode)) {
1770 PyErr_BadArgument();
1771 return NULL;
1772 }
1773 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1774 PyUnicode_GET_SIZE(unicode),
1775 NULL,
1776 0);
1777}
1778
1779/* --- Unicode Escape Codec ----------------------------------------------- */
1780
Fredrik Lundh06d12682001-01-24 07:59:11 +00001781static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001782
Guido van Rossumd57fd912000-03-10 22:53:23 +00001783PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001784 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001785 const char *errors)
1786{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001787 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001788 Py_ssize_t startinpos;
1789 Py_ssize_t endinpos;
1790 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001791 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001792 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001793 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001794 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001795 char* message;
1796 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001797 PyObject *errorHandler = NULL;
1798 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001799
Guido van Rossumd57fd912000-03-10 22:53:23 +00001800 /* Escaped strings will always be longer than the resulting
1801 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001802 length after conversion to the true value.
1803 (but if the error callback returns a long replacement string
1804 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001805 v = _PyUnicode_New(size);
1806 if (v == NULL)
1807 goto onError;
1808 if (size == 0)
1809 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001810
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001811 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001812 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001813
Guido van Rossumd57fd912000-03-10 22:53:23 +00001814 while (s < end) {
1815 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001816 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001817 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001818
1819 /* Non-escape characters are interpreted as Unicode ordinals */
1820 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001821 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001822 continue;
1823 }
1824
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001825 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001826 /* \ - Escapes */
1827 s++;
1828 switch (*s++) {
1829
1830 /* \x escapes */
1831 case '\n': break;
1832 case '\\': *p++ = '\\'; break;
1833 case '\'': *p++ = '\''; break;
1834 case '\"': *p++ = '\"'; break;
1835 case 'b': *p++ = '\b'; break;
1836 case 'f': *p++ = '\014'; break; /* FF */
1837 case 't': *p++ = '\t'; break;
1838 case 'n': *p++ = '\n'; break;
1839 case 'r': *p++ = '\r'; break;
1840 case 'v': *p++ = '\013'; break; /* VT */
1841 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1842
1843 /* \OOO (octal) escapes */
1844 case '0': case '1': case '2': case '3':
1845 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001846 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001847 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001848 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001850 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001851 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001852 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001853 break;
1854
Fredrik Lundhccc74732001-02-18 22:13:49 +00001855 /* hex escapes */
1856 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001857 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001858 digits = 2;
1859 message = "truncated \\xXX escape";
1860 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001861
Fredrik Lundhccc74732001-02-18 22:13:49 +00001862 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001863 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001864 digits = 4;
1865 message = "truncated \\uXXXX escape";
1866 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001867
Fredrik Lundhccc74732001-02-18 22:13:49 +00001868 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001869 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001870 digits = 8;
1871 message = "truncated \\UXXXXXXXX escape";
1872 hexescape:
1873 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001874 outpos = p-PyUnicode_AS_UNICODE(v);
1875 if (s+digits>end) {
1876 endinpos = size;
1877 if (unicode_decode_call_errorhandler(
1878 errors, &errorHandler,
1879 "unicodeescape", "end of string in escape sequence",
1880 starts, size, &startinpos, &endinpos, &exc, &s,
1881 (PyObject **)&v, &outpos, &p))
1882 goto onError;
1883 goto nextByte;
1884 }
1885 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001886 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001887 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001888 endinpos = (s+i+1)-starts;
1889 if (unicode_decode_call_errorhandler(
1890 errors, &errorHandler,
1891 "unicodeescape", message,
1892 starts, size, &startinpos, &endinpos, &exc, &s,
1893 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001894 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001895 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001896 }
1897 chr = (chr<<4) & ~0xF;
1898 if (c >= '0' && c <= '9')
1899 chr += c - '0';
1900 else if (c >= 'a' && c <= 'f')
1901 chr += 10 + c - 'a';
1902 else
1903 chr += 10 + c - 'A';
1904 }
1905 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001906 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001907 /* _decoding_error will have already written into the
1908 target buffer. */
1909 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001910 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001911 /* when we get here, chr is a 32-bit unicode character */
1912 if (chr <= 0xffff)
1913 /* UCS-2 character */
1914 *p++ = (Py_UNICODE) chr;
1915 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001916 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001917 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001918#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001919 *p++ = chr;
1920#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001921 chr -= 0x10000L;
1922 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001923 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001924#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001925 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001926 endinpos = s-starts;
1927 outpos = p-PyUnicode_AS_UNICODE(v);
1928 if (unicode_decode_call_errorhandler(
1929 errors, &errorHandler,
1930 "unicodeescape", "illegal Unicode character",
1931 starts, size, &startinpos, &endinpos, &exc, &s,
1932 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001933 goto onError;
1934 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001935 break;
1936
1937 /* \N{name} */
1938 case 'N':
1939 message = "malformed \\N character escape";
1940 if (ucnhash_CAPI == NULL) {
1941 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001942 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001943 m = PyImport_ImportModule("unicodedata");
1944 if (m == NULL)
1945 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001946 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001947 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001948 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001949 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00001950 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001951 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001952 if (ucnhash_CAPI == NULL)
1953 goto ucnhashError;
1954 }
1955 if (*s == '{') {
1956 const char *start = s+1;
1957 /* look for the closing brace */
1958 while (*s != '}' && s < end)
1959 s++;
1960 if (s > start && s < end && *s == '}') {
1961 /* found a name. look it up in the unicode database */
1962 message = "unknown Unicode character name";
1963 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001964 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001965 goto store;
1966 }
1967 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001968 endinpos = s-starts;
1969 outpos = p-PyUnicode_AS_UNICODE(v);
1970 if (unicode_decode_call_errorhandler(
1971 errors, &errorHandler,
1972 "unicodeescape", message,
1973 starts, size, &startinpos, &endinpos, &exc, &s,
1974 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001975 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001976 break;
1977
1978 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001979 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001980 message = "\\ at end of string";
1981 s--;
1982 endinpos = s-starts;
1983 outpos = p-PyUnicode_AS_UNICODE(v);
1984 if (unicode_decode_call_errorhandler(
1985 errors, &errorHandler,
1986 "unicodeescape", message,
1987 starts, size, &startinpos, &endinpos, &exc, &s,
1988 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001989 goto onError;
1990 }
1991 else {
1992 *p++ = '\\';
1993 *p++ = (unsigned char)s[-1];
1994 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001995 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001996 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001997 nextByte:
1998 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001999 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002000 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002001 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002002 Py_XDECREF(errorHandler);
2003 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002004 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002005
Fredrik Lundhccc74732001-02-18 22:13:49 +00002006ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002007 PyErr_SetString(
2008 PyExc_UnicodeError,
2009 "\\N escapes not supported (can't load unicodedata module)"
2010 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002011 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002012 Py_XDECREF(errorHandler);
2013 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002014 return NULL;
2015
Fredrik Lundhccc74732001-02-18 22:13:49 +00002016onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002017 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002018 Py_XDECREF(errorHandler);
2019 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002020 return NULL;
2021}
2022
/* unicodeescape_string(), defined after the findchar() helper, returns a
   Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
2029
Fredrik Lundh347ee272006-05-24 16:35:18 +00002030LOCAL(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2031 Py_ssize_t size,
2032 Py_UNICODE ch)
2033{
2034 /* like wcschr, but doesn't stop at NULL characters */
2035
2036 while (size-- > 0) {
2037 if (*s == ch)
2038 return s;
2039 s++;
2040 }
2041
2042 return NULL;
2043}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002044
Guido van Rossumd57fd912000-03-10 22:53:23 +00002045static
2046PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002047 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048 int quotes)
2049{
2050 PyObject *repr;
2051 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002052
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002053 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054
2055 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
2056 if (repr == NULL)
2057 return NULL;
2058
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002059 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002060
2061 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002062 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002063 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002064 !findchar(s, size, '"')) ? '"' : '\'';
2065 }
2066 while (size-- > 0) {
2067 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002068
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002069 /* Escape quotes and backslashes */
2070 if ((quotes &&
2071 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002072 *p++ = '\\';
2073 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002074 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002075 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002076
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002077#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002078 /* Map 21-bit characters to '\U00xxxxxx' */
2079 else if (ch >= 0x10000) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00002080 Py_ssize_t offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00002081
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002082 /* Resize the string if necessary */
2083 if (offset + 12 > PyString_GET_SIZE(repr)) {
2084 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00002085 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002086 p = PyString_AS_STRING(repr) + offset;
2087 }
2088
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002089 *p++ = '\\';
2090 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002091 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2092 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2093 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2094 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2095 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2096 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2097 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002098 *p++ = hexdigit[ch & 0x0000000F];
2099 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002100 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002101#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002102 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2103 else if (ch >= 0xD800 && ch < 0xDC00) {
2104 Py_UNICODE ch2;
2105 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002106
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002107 ch2 = *s++;
2108 size--;
2109 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2110 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2111 *p++ = '\\';
2112 *p++ = 'U';
2113 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2114 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2115 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2116 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2117 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2118 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2119 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2120 *p++ = hexdigit[ucs & 0x0000000F];
2121 continue;
2122 }
2123 /* Fall through: isolated surrogates are copied as-is */
2124 s--;
2125 size++;
2126 }
2127
Guido van Rossumd57fd912000-03-10 22:53:23 +00002128 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002129 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002130 *p++ = '\\';
2131 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002132 *p++ = hexdigit[(ch >> 12) & 0x000F];
2133 *p++ = hexdigit[(ch >> 8) & 0x000F];
2134 *p++ = hexdigit[(ch >> 4) & 0x000F];
2135 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002136 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002137
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002138 /* Map special whitespace to '\t', \n', '\r' */
2139 else if (ch == '\t') {
2140 *p++ = '\\';
2141 *p++ = 't';
2142 }
2143 else if (ch == '\n') {
2144 *p++ = '\\';
2145 *p++ = 'n';
2146 }
2147 else if (ch == '\r') {
2148 *p++ = '\\';
2149 *p++ = 'r';
2150 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002151
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002152 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002153 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002154 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002155 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002156 *p++ = hexdigit[(ch >> 4) & 0x000F];
2157 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002158 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002159
Guido van Rossumd57fd912000-03-10 22:53:23 +00002160 /* Copy everything else as-is */
2161 else
2162 *p++ = (char) ch;
2163 }
2164 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002165 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002166
2167 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002168 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002169 return repr;
2170}
2171
2172PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002173 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174{
2175 return unicodeescape_string(s, size, 0);
2176}
2177
2178PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2179{
2180 if (!PyUnicode_Check(unicode)) {
2181 PyErr_BadArgument();
2182 return NULL;
2183 }
2184 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2185 PyUnicode_GET_SIZE(unicode));
2186}
2187
2188/* --- Raw Unicode Escape Codec ------------------------------------------- */
2189
2190PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002191 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002192 const char *errors)
2193{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002194 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002195 Py_ssize_t startinpos;
2196 Py_ssize_t endinpos;
2197 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002198 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002199 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002200 const char *end;
2201 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002202 PyObject *errorHandler = NULL;
2203 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002204
Guido van Rossumd57fd912000-03-10 22:53:23 +00002205 /* Escaped strings will always be longer than the resulting
2206 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002207 length after conversion to the true value. (But decoding error
2208 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002209 v = _PyUnicode_New(size);
2210 if (v == NULL)
2211 goto onError;
2212 if (size == 0)
2213 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002214 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002215 end = s + size;
2216 while (s < end) {
2217 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002218 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002219 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002220 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002221
2222 /* Non-escape characters are interpreted as Unicode ordinals */
2223 if (*s != '\\') {
2224 *p++ = (unsigned char)*s++;
2225 continue;
2226 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002227 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228
2229 /* \u-escapes are only interpreted iff the number of leading
2230 backslashes if odd */
2231 bs = s;
2232 for (;s < end;) {
2233 if (*s != '\\')
2234 break;
2235 *p++ = (unsigned char)*s++;
2236 }
2237 if (((s - bs) & 1) == 0 ||
2238 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002239 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002240 continue;
2241 }
2242 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002243 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002244 s++;
2245
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002246 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002247 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002248 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002249 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002250 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002251 endinpos = s-starts;
2252 if (unicode_decode_call_errorhandler(
2253 errors, &errorHandler,
2254 "rawunicodeescape", "truncated \\uXXXX",
2255 starts, size, &startinpos, &endinpos, &exc, &s,
2256 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002257 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002258 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002259 }
2260 x = (x<<4) & ~0xF;
2261 if (c >= '0' && c <= '9')
2262 x += c - '0';
2263 else if (c >= 'a' && c <= 'f')
2264 x += 10 + c - 'a';
2265 else
2266 x += 10 + c - 'A';
2267 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002268#ifndef Py_UNICODE_WIDE
2269 if (x > 0x10000) {
2270 if (unicode_decode_call_errorhandler(
2271 errors, &errorHandler,
2272 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2273 starts, size, &startinpos, &endinpos, &exc, &s,
2274 (PyObject **)&v, &outpos, &p))
2275 goto onError;
2276 }
2277#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002278 *p++ = x;
2279 nextByte:
2280 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002281 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002282 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002283 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002284 Py_XDECREF(errorHandler);
2285 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002286 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002287
Guido van Rossumd57fd912000-03-10 22:53:23 +00002288 onError:
2289 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002290 Py_XDECREF(errorHandler);
2291 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002292 return NULL;
2293}
2294
2295PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002296 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002297{
2298 PyObject *repr;
2299 char *p;
2300 char *q;
2301
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002302 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002303
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002304#ifdef Py_UNICODE_WIDE
2305 repr = PyString_FromStringAndSize(NULL, 10 * size);
2306#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002307 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002308#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002309 if (repr == NULL)
2310 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002311 if (size == 0)
2312 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002313
2314 p = q = PyString_AS_STRING(repr);
2315 while (size-- > 0) {
2316 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002317#ifdef Py_UNICODE_WIDE
2318 /* Map 32-bit characters to '\Uxxxxxxxx' */
2319 if (ch >= 0x10000) {
2320 *p++ = '\\';
2321 *p++ = 'U';
2322 *p++ = hexdigit[(ch >> 28) & 0xf];
2323 *p++ = hexdigit[(ch >> 24) & 0xf];
2324 *p++ = hexdigit[(ch >> 20) & 0xf];
2325 *p++ = hexdigit[(ch >> 16) & 0xf];
2326 *p++ = hexdigit[(ch >> 12) & 0xf];
2327 *p++ = hexdigit[(ch >> 8) & 0xf];
2328 *p++ = hexdigit[(ch >> 4) & 0xf];
2329 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002330 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002331 else
2332#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002333 /* Map 16-bit characters to '\uxxxx' */
2334 if (ch >= 256) {
2335 *p++ = '\\';
2336 *p++ = 'u';
2337 *p++ = hexdigit[(ch >> 12) & 0xf];
2338 *p++ = hexdigit[(ch >> 8) & 0xf];
2339 *p++ = hexdigit[(ch >> 4) & 0xf];
2340 *p++ = hexdigit[ch & 15];
2341 }
2342 /* Copy everything else as-is */
2343 else
2344 *p++ = (char) ch;
2345 }
2346 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002347 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002348 return repr;
2349}
2350
2351PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2352{
2353 if (!PyUnicode_Check(unicode)) {
2354 PyErr_BadArgument();
2355 return NULL;
2356 }
2357 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2358 PyUnicode_GET_SIZE(unicode));
2359}
2360
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002361/* --- Unicode Internal Codec ------------------------------------------- */
2362
2363PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002364 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002365 const char *errors)
2366{
2367 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002368 Py_ssize_t startinpos;
2369 Py_ssize_t endinpos;
2370 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002371 PyUnicodeObject *v;
2372 Py_UNICODE *p;
2373 const char *end;
2374 const char *reason;
2375 PyObject *errorHandler = NULL;
2376 PyObject *exc = NULL;
2377
Neal Norwitzd43069c2006-01-08 01:12:10 +00002378#ifdef Py_UNICODE_WIDE
2379 Py_UNICODE unimax = PyUnicode_GetMax();
2380#endif
2381
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002382 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2383 if (v == NULL)
2384 goto onError;
2385 if (PyUnicode_GetSize((PyObject *)v) == 0)
2386 return (PyObject *)v;
2387 p = PyUnicode_AS_UNICODE(v);
2388 end = s + size;
2389
2390 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00002391 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002392 /* We have to sanity check the raw data, otherwise doom looms for
2393 some malformed UCS-4 data. */
2394 if (
2395 #ifdef Py_UNICODE_WIDE
2396 *p > unimax || *p < 0 ||
2397 #endif
2398 end-s < Py_UNICODE_SIZE
2399 )
2400 {
2401 startinpos = s - starts;
2402 if (end-s < Py_UNICODE_SIZE) {
2403 endinpos = end-starts;
2404 reason = "truncated input";
2405 }
2406 else {
2407 endinpos = s - starts + Py_UNICODE_SIZE;
2408 reason = "illegal code point (> 0x10FFFF)";
2409 }
2410 outpos = p - PyUnicode_AS_UNICODE(v);
2411 if (unicode_decode_call_errorhandler(
2412 errors, &errorHandler,
2413 "unicode_internal", reason,
2414 starts, size, &startinpos, &endinpos, &exc, &s,
2415 (PyObject **)&v, &outpos, &p)) {
2416 goto onError;
2417 }
2418 }
2419 else {
2420 p++;
2421 s += Py_UNICODE_SIZE;
2422 }
2423 }
2424
Martin v. Löwis412fb672006-04-13 06:34:32 +00002425 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002426 goto onError;
2427 Py_XDECREF(errorHandler);
2428 Py_XDECREF(exc);
2429 return (PyObject *)v;
2430
2431 onError:
2432 Py_XDECREF(v);
2433 Py_XDECREF(errorHandler);
2434 Py_XDECREF(exc);
2435 return NULL;
2436}
2437
Guido van Rossumd57fd912000-03-10 22:53:23 +00002438/* --- Latin-1 Codec ------------------------------------------------------ */
2439
2440PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002441 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002442 const char *errors)
2443{
2444 PyUnicodeObject *v;
2445 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002446
Guido van Rossumd57fd912000-03-10 22:53:23 +00002447 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002448 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002449 Py_UNICODE r = *(unsigned char*)s;
2450 return PyUnicode_FromUnicode(&r, 1);
2451 }
2452
Guido van Rossumd57fd912000-03-10 22:53:23 +00002453 v = _PyUnicode_New(size);
2454 if (v == NULL)
2455 goto onError;
2456 if (size == 0)
2457 return (PyObject *)v;
2458 p = PyUnicode_AS_UNICODE(v);
2459 while (size-- > 0)
2460 *p++ = (unsigned char)*s++;
2461 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002462
Guido van Rossumd57fd912000-03-10 22:53:23 +00002463 onError:
2464 Py_XDECREF(v);
2465 return NULL;
2466}
2467
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002468/* create or adjust a UnicodeEncodeError */
2469static void make_encode_exception(PyObject **exceptionObject,
2470 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002471 const Py_UNICODE *unicode, Py_ssize_t size,
2472 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002473 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002474{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002475 if (*exceptionObject == NULL) {
2476 *exceptionObject = PyUnicodeEncodeError_Create(
2477 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002478 }
2479 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002480 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2481 goto onError;
2482 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2483 goto onError;
2484 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2485 goto onError;
2486 return;
2487 onError:
2488 Py_DECREF(*exceptionObject);
2489 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490 }
2491}
2492
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002493/* raises a UnicodeEncodeError */
2494static void raise_encode_exception(PyObject **exceptionObject,
2495 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002496 const Py_UNICODE *unicode, Py_ssize_t size,
2497 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002498 const char *reason)
2499{
2500 make_encode_exception(exceptionObject,
2501 encoding, unicode, size, startpos, endpos, reason);
2502 if (*exceptionObject != NULL)
2503 PyCodec_StrictErrors(*exceptionObject);
2504}
2505
2506/* error handling callback helper:
2507 build arguments, call the callback and check the arguments,
2508 put the result into newpos and return the replacement string, which
2509 has to be freed by the caller */
2510static PyObject *unicode_encode_call_errorhandler(const char *errors,
2511 PyObject **errorHandler,
2512 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002513 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2514 Py_ssize_t startpos, Py_ssize_t endpos,
2515 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002516{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002517 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002518
2519 PyObject *restuple;
2520 PyObject *resunicode;
2521
2522 if (*errorHandler == NULL) {
2523 *errorHandler = PyCodec_LookupError(errors);
2524 if (*errorHandler == NULL)
2525 return NULL;
2526 }
2527
2528 make_encode_exception(exceptionObject,
2529 encoding, unicode, size, startpos, endpos, reason);
2530 if (*exceptionObject == NULL)
2531 return NULL;
2532
2533 restuple = PyObject_CallFunctionObjArgs(
2534 *errorHandler, *exceptionObject, NULL);
2535 if (restuple == NULL)
2536 return NULL;
2537 if (!PyTuple_Check(restuple)) {
2538 PyErr_Format(PyExc_TypeError, &argparse[4]);
2539 Py_DECREF(restuple);
2540 return NULL;
2541 }
2542 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2543 &resunicode, newpos)) {
2544 Py_DECREF(restuple);
2545 return NULL;
2546 }
2547 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002548 *newpos = size+*newpos;
2549 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002550 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002551 Py_DECREF(restuple);
2552 return NULL;
2553 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002554 Py_INCREF(resunicode);
2555 Py_DECREF(restuple);
2556 return resunicode;
2557}
2558
2559static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002560 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002561 const char *errors,
2562 int limit)
2563{
2564 /* output object */
2565 PyObject *res;
2566 /* pointers to the beginning and end+1 of input */
2567 const Py_UNICODE *startp = p;
2568 const Py_UNICODE *endp = p + size;
2569 /* pointer to the beginning of the unencodable characters */
2570 /* const Py_UNICODE *badp = NULL; */
2571 /* pointer into the output */
2572 char *str;
2573 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002574 Py_ssize_t respos = 0;
2575 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00002576 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2577 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002578 PyObject *errorHandler = NULL;
2579 PyObject *exc = NULL;
2580 /* the following variable is used for caching string comparisons
2581 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2582 int known_errorHandler = -1;
2583
2584 /* allocate enough for a simple encoding without
2585 replacements, if we need more, we'll resize */
2586 res = PyString_FromStringAndSize(NULL, size);
2587 if (res == NULL)
2588 goto onError;
2589 if (size == 0)
2590 return res;
2591 str = PyString_AS_STRING(res);
2592 ressize = size;
2593
2594 while (p<endp) {
2595 Py_UNICODE c = *p;
2596
2597 /* can we encode this? */
2598 if (c<limit) {
2599 /* no overflow check, because we know that the space is enough */
2600 *str++ = (char)c;
2601 ++p;
2602 }
2603 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002604 Py_ssize_t unicodepos = p-startp;
2605 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002606 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002607 Py_ssize_t repsize;
2608 Py_ssize_t newpos;
2609 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002610 Py_UNICODE *uni2;
2611 /* startpos for collecting unencodable chars */
2612 const Py_UNICODE *collstart = p;
2613 const Py_UNICODE *collend = p;
2614 /* find all unecodable characters */
2615 while ((collend < endp) && ((*collend)>=limit))
2616 ++collend;
2617 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2618 if (known_errorHandler==-1) {
2619 if ((errors==NULL) || (!strcmp(errors, "strict")))
2620 known_errorHandler = 1;
2621 else if (!strcmp(errors, "replace"))
2622 known_errorHandler = 2;
2623 else if (!strcmp(errors, "ignore"))
2624 known_errorHandler = 3;
2625 else if (!strcmp(errors, "xmlcharrefreplace"))
2626 known_errorHandler = 4;
2627 else
2628 known_errorHandler = 0;
2629 }
2630 switch (known_errorHandler) {
2631 case 1: /* strict */
2632 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2633 goto onError;
2634 case 2: /* replace */
2635 while (collstart++<collend)
2636 *str++ = '?'; /* fall through */
2637 case 3: /* ignore */
2638 p = collend;
2639 break;
2640 case 4: /* xmlcharrefreplace */
2641 respos = str-PyString_AS_STRING(res);
2642 /* determine replacement size (temporarily (mis)uses p) */
2643 for (p = collstart, repsize = 0; p < collend; ++p) {
2644 if (*p<10)
2645 repsize += 2+1+1;
2646 else if (*p<100)
2647 repsize += 2+2+1;
2648 else if (*p<1000)
2649 repsize += 2+3+1;
2650 else if (*p<10000)
2651 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002652#ifndef Py_UNICODE_WIDE
2653 else
2654 repsize += 2+5+1;
2655#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002656 else if (*p<100000)
2657 repsize += 2+5+1;
2658 else if (*p<1000000)
2659 repsize += 2+6+1;
2660 else
2661 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002662#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002663 }
2664 requiredsize = respos+repsize+(endp-collend);
2665 if (requiredsize > ressize) {
2666 if (requiredsize<2*ressize)
2667 requiredsize = 2*ressize;
2668 if (_PyString_Resize(&res, requiredsize))
2669 goto onError;
2670 str = PyString_AS_STRING(res) + respos;
2671 ressize = requiredsize;
2672 }
2673 /* generate replacement (temporarily (mis)uses p) */
2674 for (p = collstart; p < collend; ++p) {
2675 str += sprintf(str, "&#%d;", (int)*p);
2676 }
2677 p = collend;
2678 break;
2679 default:
2680 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2681 encoding, reason, startp, size, &exc,
2682 collstart-startp, collend-startp, &newpos);
2683 if (repunicode == NULL)
2684 goto onError;
2685 /* need more space? (at least enough for what we
2686 have+the replacement+the rest of the string, so
2687 we won't have to check space for encodable characters) */
2688 respos = str-PyString_AS_STRING(res);
2689 repsize = PyUnicode_GET_SIZE(repunicode);
2690 requiredsize = respos+repsize+(endp-collend);
2691 if (requiredsize > ressize) {
2692 if (requiredsize<2*ressize)
2693 requiredsize = 2*ressize;
2694 if (_PyString_Resize(&res, requiredsize)) {
2695 Py_DECREF(repunicode);
2696 goto onError;
2697 }
2698 str = PyString_AS_STRING(res) + respos;
2699 ressize = requiredsize;
2700 }
2701 /* check if there is anything unencodable in the replacement
2702 and copy it to the output */
2703 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2704 c = *uni2;
2705 if (c >= limit) {
2706 raise_encode_exception(&exc, encoding, startp, size,
2707 unicodepos, unicodepos+1, reason);
2708 Py_DECREF(repunicode);
2709 goto onError;
2710 }
2711 *str = (char)c;
2712 }
2713 p = startp + newpos;
2714 Py_DECREF(repunicode);
2715 }
2716 }
2717 }
2718 /* Resize if we allocated to much */
2719 respos = str-PyString_AS_STRING(res);
2720 if (respos<ressize)
2721 /* If this falls res will be NULL */
2722 _PyString_Resize(&res, respos);
2723 Py_XDECREF(errorHandler);
2724 Py_XDECREF(exc);
2725 return res;
2726
2727 onError:
2728 Py_XDECREF(res);
2729 Py_XDECREF(errorHandler);
2730 Py_XDECREF(exc);
2731 return NULL;
2732}
2733
Guido van Rossumd57fd912000-03-10 22:53:23 +00002734PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002735 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002736 const char *errors)
2737{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002738 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002739}
2740
2741PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2742{
2743 if (!PyUnicode_Check(unicode)) {
2744 PyErr_BadArgument();
2745 return NULL;
2746 }
2747 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2748 PyUnicode_GET_SIZE(unicode),
2749 NULL);
2750}
2751
2752/* --- 7-bit ASCII Codec -------------------------------------------------- */
2753
Guido van Rossumd57fd912000-03-10 22:53:23 +00002754PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002755 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756 const char *errors)
2757{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002758 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002759 PyUnicodeObject *v;
2760 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002761 Py_ssize_t startinpos;
2762 Py_ssize_t endinpos;
2763 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002764 const char *e;
2765 PyObject *errorHandler = NULL;
2766 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002767
Guido van Rossumd57fd912000-03-10 22:53:23 +00002768 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002769 if (size == 1 && *(unsigned char*)s < 128) {
2770 Py_UNICODE r = *(unsigned char*)s;
2771 return PyUnicode_FromUnicode(&r, 1);
2772 }
Tim Petersced69f82003-09-16 20:30:58 +00002773
Guido van Rossumd57fd912000-03-10 22:53:23 +00002774 v = _PyUnicode_New(size);
2775 if (v == NULL)
2776 goto onError;
2777 if (size == 0)
2778 return (PyObject *)v;
2779 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002780 e = s + size;
2781 while (s < e) {
2782 register unsigned char c = (unsigned char)*s;
2783 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002785 ++s;
2786 }
2787 else {
2788 startinpos = s-starts;
2789 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002790 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002791 if (unicode_decode_call_errorhandler(
2792 errors, &errorHandler,
2793 "ascii", "ordinal not in range(128)",
2794 starts, size, &startinpos, &endinpos, &exc, &s,
2795 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002796 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002797 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002799 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00002800 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002801 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002802 Py_XDECREF(errorHandler);
2803 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002804 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002805
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806 onError:
2807 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002808 Py_XDECREF(errorHandler);
2809 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002810 return NULL;
2811}
2812
Guido van Rossumd57fd912000-03-10 22:53:23 +00002813PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002814 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002815 const char *errors)
2816{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002817 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002818}
2819
2820PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2821{
2822 if (!PyUnicode_Check(unicode)) {
2823 PyErr_BadArgument();
2824 return NULL;
2825 }
2826 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2827 PyUnicode_GET_SIZE(unicode),
2828 NULL);
2829}
2830
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002831#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002832
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002833/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002834
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002835PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002836 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002837 const char *errors)
2838{
2839 PyUnicodeObject *v;
2840 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002841 DWORD usize;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002842
2843 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002844 assert(size < INT_MAX);
2845 usize = MultiByteToWideChar(CP_ACP, 0, s, (int)size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002846 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002847 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2848
2849 v = _PyUnicode_New(usize);
2850 if (v == NULL)
2851 return NULL;
2852 if (usize == 0)
2853 return (PyObject *)v;
2854 p = PyUnicode_AS_UNICODE(v);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002855 if (0 == MultiByteToWideChar(CP_ACP, 0, s, (int)size, p, usize)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002856 Py_DECREF(v);
2857 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2858 }
2859
2860 return (PyObject *)v;
2861}
2862
2863PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002864 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002865 const char *errors)
2866{
2867 PyObject *repr;
2868 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002869 DWORD mbcssize;
2870
2871 /* If there are no characters, bail now! */
2872 if (size==0)
2873 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002874
2875 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002876 assert(size<INT_MAX);
2877 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, (int)size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002878 if (mbcssize==0)
2879 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2880
2881 repr = PyString_FromStringAndSize(NULL, mbcssize);
2882 if (repr == NULL)
2883 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002884 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002885 return repr;
2886
2887 /* Do the conversion */
2888 s = PyString_AS_STRING(repr);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002889 assert(size < INT_MAX);
2890 if (0 == WideCharToMultiByte(CP_ACP, 0, p, (int)size, s, mbcssize, NULL, NULL)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002891 Py_DECREF(repr);
2892 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2893 }
2894 return repr;
2895}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002896
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002897PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2898{
2899 if (!PyUnicode_Check(unicode)) {
2900 PyErr_BadArgument();
2901 return NULL;
2902 }
2903 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2904 PyUnicode_GET_SIZE(unicode),
2905 NULL);
2906}
2907
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002908#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002909
Guido van Rossumd57fd912000-03-10 22:53:23 +00002910/* --- Character Mapping Codec -------------------------------------------- */
2911
Guido van Rossumd57fd912000-03-10 22:53:23 +00002912PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002913 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002914 PyObject *mapping,
2915 const char *errors)
2916{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002917 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002918 Py_ssize_t startinpos;
2919 Py_ssize_t endinpos;
2920 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002921 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002922 PyUnicodeObject *v;
2923 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002924 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002925 PyObject *errorHandler = NULL;
2926 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002927 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002928 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00002929
Guido van Rossumd57fd912000-03-10 22:53:23 +00002930 /* Default to Latin-1 */
2931 if (mapping == NULL)
2932 return PyUnicode_DecodeLatin1(s, size, errors);
2933
2934 v = _PyUnicode_New(size);
2935 if (v == NULL)
2936 goto onError;
2937 if (size == 0)
2938 return (PyObject *)v;
2939 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002940 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002941 if (PyUnicode_CheckExact(mapping)) {
2942 mapstring = PyUnicode_AS_UNICODE(mapping);
2943 maplen = PyUnicode_GET_SIZE(mapping);
2944 while (s < e) {
2945 unsigned char ch = *s;
2946 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002947
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002948 if (ch < maplen)
2949 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002950
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002951 if (x == 0xfffe) {
2952 /* undefined mapping */
2953 outpos = p-PyUnicode_AS_UNICODE(v);
2954 startinpos = s-starts;
2955 endinpos = startinpos+1;
2956 if (unicode_decode_call_errorhandler(
2957 errors, &errorHandler,
2958 "charmap", "character maps to <undefined>",
2959 starts, size, &startinpos, &endinpos, &exc, &s,
2960 (PyObject **)&v, &outpos, &p)) {
2961 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002962 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002963 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002964 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002965 *p++ = x;
2966 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002967 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002968 }
2969 else {
2970 while (s < e) {
2971 unsigned char ch = *s;
2972 PyObject *w, *x;
2973
2974 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2975 w = PyInt_FromLong((long)ch);
2976 if (w == NULL)
2977 goto onError;
2978 x = PyObject_GetItem(mapping, w);
2979 Py_DECREF(w);
2980 if (x == NULL) {
2981 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2982 /* No mapping found means: mapping is undefined. */
2983 PyErr_Clear();
2984 x = Py_None;
2985 Py_INCREF(x);
2986 } else
2987 goto onError;
2988 }
2989
2990 /* Apply mapping */
2991 if (PyInt_Check(x)) {
2992 long value = PyInt_AS_LONG(x);
2993 if (value < 0 || value > 65535) {
2994 PyErr_SetString(PyExc_TypeError,
2995 "character mapping must be in range(65536)");
2996 Py_DECREF(x);
2997 goto onError;
2998 }
2999 *p++ = (Py_UNICODE)value;
3000 }
3001 else if (x == Py_None) {
3002 /* undefined mapping */
3003 outpos = p-PyUnicode_AS_UNICODE(v);
3004 startinpos = s-starts;
3005 endinpos = startinpos+1;
3006 if (unicode_decode_call_errorhandler(
3007 errors, &errorHandler,
3008 "charmap", "character maps to <undefined>",
3009 starts, size, &startinpos, &endinpos, &exc, &s,
3010 (PyObject **)&v, &outpos, &p)) {
3011 Py_DECREF(x);
3012 goto onError;
3013 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003014 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003015 continue;
3016 }
3017 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003018 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003019
3020 if (targetsize == 1)
3021 /* 1-1 mapping */
3022 *p++ = *PyUnicode_AS_UNICODE(x);
3023
3024 else if (targetsize > 1) {
3025 /* 1-n mapping */
3026 if (targetsize > extrachars) {
3027 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003028 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3029 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003030 (targetsize << 2);
3031 extrachars += needed;
3032 if (_PyUnicode_Resize(&v,
3033 PyUnicode_GET_SIZE(v) + needed) < 0) {
3034 Py_DECREF(x);
3035 goto onError;
3036 }
3037 p = PyUnicode_AS_UNICODE(v) + oldpos;
3038 }
3039 Py_UNICODE_COPY(p,
3040 PyUnicode_AS_UNICODE(x),
3041 targetsize);
3042 p += targetsize;
3043 extrachars -= targetsize;
3044 }
3045 /* 1-0 mapping: skip the character */
3046 }
3047 else {
3048 /* wrong return value */
3049 PyErr_SetString(PyExc_TypeError,
3050 "character mapping must return integer, None or unicode");
3051 Py_DECREF(x);
3052 goto onError;
3053 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003054 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003055 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057 }
3058 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003059 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003060 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003061 Py_XDECREF(errorHandler);
3062 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003063 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003064
Guido van Rossumd57fd912000-03-10 22:53:23 +00003065 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003066 Py_XDECREF(errorHandler);
3067 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003068 Py_XDECREF(v);
3069 return NULL;
3070}
3071
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003072/* Lookup the character ch in the mapping. If the character
3073 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003074 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003075static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003076{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003077 PyObject *w = PyInt_FromLong((long)c);
3078 PyObject *x;
3079
3080 if (w == NULL)
3081 return NULL;
3082 x = PyObject_GetItem(mapping, w);
3083 Py_DECREF(w);
3084 if (x == NULL) {
3085 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3086 /* No mapping found means: mapping is undefined. */
3087 PyErr_Clear();
3088 x = Py_None;
3089 Py_INCREF(x);
3090 return x;
3091 } else
3092 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003093 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003094 else if (x == Py_None)
3095 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003096 else if (PyInt_Check(x)) {
3097 long value = PyInt_AS_LONG(x);
3098 if (value < 0 || value > 255) {
3099 PyErr_SetString(PyExc_TypeError,
3100 "character mapping must be in range(256)");
3101 Py_DECREF(x);
3102 return NULL;
3103 }
3104 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003105 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003106 else if (PyString_Check(x))
3107 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003108 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003109 /* wrong return value */
3110 PyErr_SetString(PyExc_TypeError,
3111 "character mapping must return integer, None or str");
3112 Py_DECREF(x);
3113 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003114 }
3115}
3116
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003117/* lookup the character, put the result in the output string and adjust
3118 various state variables. Reallocate the output string if not enough
3119 space is available. Return a new reference to the object that
3120 was put in the output buffer, or Py_None, if the mapping was undefined
3121 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003122 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003123static
3124PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003125 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003126{
3127 PyObject *rep = charmapencode_lookup(c, mapping);
3128
3129 if (rep==NULL)
3130 return NULL;
3131 else if (rep==Py_None)
3132 return rep;
3133 else {
3134 char *outstart = PyString_AS_STRING(*outobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003135 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003136 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003137 Py_ssize_t requiredsize = *outpos+1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003138 if (outsize<requiredsize) {
3139 /* exponentially overallocate to minimize reallocations */
3140 if (requiredsize < 2*outsize)
3141 requiredsize = 2*outsize;
3142 if (_PyString_Resize(outobj, requiredsize)) {
3143 Py_DECREF(rep);
3144 return NULL;
3145 }
3146 outstart = PyString_AS_STRING(*outobj);
3147 }
3148 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3149 }
3150 else {
3151 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003152 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3153 Py_ssize_t requiredsize = *outpos+repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003154 if (outsize<requiredsize) {
3155 /* exponentially overallocate to minimize reallocations */
3156 if (requiredsize < 2*outsize)
3157 requiredsize = 2*outsize;
3158 if (_PyString_Resize(outobj, requiredsize)) {
3159 Py_DECREF(rep);
3160 return NULL;
3161 }
3162 outstart = PyString_AS_STRING(*outobj);
3163 }
3164 memcpy(outstart + *outpos, repchars, repsize);
3165 *outpos += repsize;
3166 }
3167 }
3168 return rep;
3169}
3170
3171/* handle an error in PyUnicode_EncodeCharmap
3172 Return 0 on success, -1 on error */
3173static
3174int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003175 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003176 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003177 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003178 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003179{
3180 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003181 Py_ssize_t repsize;
3182 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003183 Py_UNICODE *uni2;
3184 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003185 Py_ssize_t collstartpos = *inpos;
3186 Py_ssize_t collendpos = *inpos+1;
3187 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003188 char *encoding = "charmap";
3189 char *reason = "character maps to <undefined>";
3190
3191 PyObject *x;
3192 /* find all unencodable characters */
3193 while (collendpos < size) {
3194 x = charmapencode_lookup(p[collendpos], mapping);
3195 if (x==NULL)
3196 return -1;
3197 else if (x!=Py_None) {
3198 Py_DECREF(x);
3199 break;
3200 }
3201 Py_DECREF(x);
3202 ++collendpos;
3203 }
3204 /* cache callback name lookup
3205 * (if not done yet, i.e. it's the first error) */
3206 if (*known_errorHandler==-1) {
3207 if ((errors==NULL) || (!strcmp(errors, "strict")))
3208 *known_errorHandler = 1;
3209 else if (!strcmp(errors, "replace"))
3210 *known_errorHandler = 2;
3211 else if (!strcmp(errors, "ignore"))
3212 *known_errorHandler = 3;
3213 else if (!strcmp(errors, "xmlcharrefreplace"))
3214 *known_errorHandler = 4;
3215 else
3216 *known_errorHandler = 0;
3217 }
3218 switch (*known_errorHandler) {
3219 case 1: /* strict */
3220 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3221 return -1;
3222 case 2: /* replace */
3223 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3224 x = charmapencode_output('?', mapping, res, respos);
3225 if (x==NULL) {
3226 return -1;
3227 }
3228 else if (x==Py_None) {
3229 Py_DECREF(x);
3230 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3231 return -1;
3232 }
3233 Py_DECREF(x);
3234 }
3235 /* fall through */
3236 case 3: /* ignore */
3237 *inpos = collendpos;
3238 break;
3239 case 4: /* xmlcharrefreplace */
3240 /* generate replacement (temporarily (mis)uses p) */
3241 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3242 char buffer[2+29+1+1];
3243 char *cp;
3244 sprintf(buffer, "&#%d;", (int)p[collpos]);
3245 for (cp = buffer; *cp; ++cp) {
3246 x = charmapencode_output(*cp, mapping, res, respos);
3247 if (x==NULL)
3248 return -1;
3249 else if (x==Py_None) {
3250 Py_DECREF(x);
3251 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3252 return -1;
3253 }
3254 Py_DECREF(x);
3255 }
3256 }
3257 *inpos = collendpos;
3258 break;
3259 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003260 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003261 encoding, reason, p, size, exceptionObject,
3262 collstartpos, collendpos, &newpos);
3263 if (repunicode == NULL)
3264 return -1;
3265 /* generate replacement */
3266 repsize = PyUnicode_GET_SIZE(repunicode);
3267 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3268 x = charmapencode_output(*uni2, mapping, res, respos);
3269 if (x==NULL) {
3270 Py_DECREF(repunicode);
3271 return -1;
3272 }
3273 else if (x==Py_None) {
3274 Py_DECREF(repunicode);
3275 Py_DECREF(x);
3276 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3277 return -1;
3278 }
3279 Py_DECREF(x);
3280 }
3281 *inpos = newpos;
3282 Py_DECREF(repunicode);
3283 }
3284 return 0;
3285}
3286
Guido van Rossumd57fd912000-03-10 22:53:23 +00003287PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003288 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003289 PyObject *mapping,
3290 const char *errors)
3291{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003292 /* output object */
3293 PyObject *res = NULL;
3294 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003295 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003296 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003297 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003298 PyObject *errorHandler = NULL;
3299 PyObject *exc = NULL;
3300 /* the following variable is used for caching string comparisons
3301 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3302 * 3=ignore, 4=xmlcharrefreplace */
3303 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003304
3305 /* Default to Latin-1 */
3306 if (mapping == NULL)
3307 return PyUnicode_EncodeLatin1(p, size, errors);
3308
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003309 /* allocate enough for a simple encoding without
3310 replacements, if we need more, we'll resize */
3311 res = PyString_FromStringAndSize(NULL, size);
3312 if (res == NULL)
3313 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003314 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003315 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003316
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003317 while (inpos<size) {
3318 /* try to encode it */
3319 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3320 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003321 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003322 if (x==Py_None) { /* unencodable character */
3323 if (charmap_encoding_error(p, size, &inpos, mapping,
3324 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003325 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003326 &res, &respos)) {
3327 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003328 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003329 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003330 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003331 else
3332 /* done with this character => adjust input position */
3333 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003334 Py_DECREF(x);
3335 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003336
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003337 /* Resize if we allocated to much */
3338 if (respos<PyString_GET_SIZE(res)) {
3339 if (_PyString_Resize(&res, respos))
3340 goto onError;
3341 }
3342 Py_XDECREF(exc);
3343 Py_XDECREF(errorHandler);
3344 return res;
3345
3346 onError:
3347 Py_XDECREF(res);
3348 Py_XDECREF(exc);
3349 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003350 return NULL;
3351}
3352
3353PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3354 PyObject *mapping)
3355{
3356 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3357 PyErr_BadArgument();
3358 return NULL;
3359 }
3360 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3361 PyUnicode_GET_SIZE(unicode),
3362 mapping,
3363 NULL);
3364}
3365
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003366/* create or adjust a UnicodeTranslateError */
3367static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003368 const Py_UNICODE *unicode, Py_ssize_t size,
3369 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003370 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003371{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003372 if (*exceptionObject == NULL) {
3373 *exceptionObject = PyUnicodeTranslateError_Create(
3374 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003375 }
3376 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003377 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3378 goto onError;
3379 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3380 goto onError;
3381 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3382 goto onError;
3383 return;
3384 onError:
3385 Py_DECREF(*exceptionObject);
3386 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003387 }
3388}
3389
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003390/* raises a UnicodeTranslateError */
3391static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003392 const Py_UNICODE *unicode, Py_ssize_t size,
3393 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003394 const char *reason)
3395{
3396 make_translate_exception(exceptionObject,
3397 unicode, size, startpos, endpos, reason);
3398 if (*exceptionObject != NULL)
3399 PyCodec_StrictErrors(*exceptionObject);
3400}
3401
3402/* error handling callback helper:
3403 build arguments, call the callback and check the arguments,
3404 put the result into newpos and return the replacement string, which
3405 has to be freed by the caller */
3406static PyObject *unicode_translate_call_errorhandler(const char *errors,
3407 PyObject **errorHandler,
3408 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003409 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3410 Py_ssize_t startpos, Py_ssize_t endpos,
3411 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003412{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003413 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003414
Martin v. Löwis412fb672006-04-13 06:34:32 +00003415 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003416 PyObject *restuple;
3417 PyObject *resunicode;
3418
3419 if (*errorHandler == NULL) {
3420 *errorHandler = PyCodec_LookupError(errors);
3421 if (*errorHandler == NULL)
3422 return NULL;
3423 }
3424
3425 make_translate_exception(exceptionObject,
3426 unicode, size, startpos, endpos, reason);
3427 if (*exceptionObject == NULL)
3428 return NULL;
3429
3430 restuple = PyObject_CallFunctionObjArgs(
3431 *errorHandler, *exceptionObject, NULL);
3432 if (restuple == NULL)
3433 return NULL;
3434 if (!PyTuple_Check(restuple)) {
3435 PyErr_Format(PyExc_TypeError, &argparse[4]);
3436 Py_DECREF(restuple);
3437 return NULL;
3438 }
3439 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003440 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003441 Py_DECREF(restuple);
3442 return NULL;
3443 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003444 if (i_newpos<0)
3445 *newpos = size+i_newpos;
3446 else
3447 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003448 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003449 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003450 Py_DECREF(restuple);
3451 return NULL;
3452 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003453 Py_INCREF(resunicode);
3454 Py_DECREF(restuple);
3455 return resunicode;
3456}
3457
3458/* Lookup the character ch in the mapping and put the result in result,
3459 which must be decrefed by the caller.
3460 Return 0 on success, -1 on error */
3461static
3462int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3463{
3464 PyObject *w = PyInt_FromLong((long)c);
3465 PyObject *x;
3466
3467 if (w == NULL)
3468 return -1;
3469 x = PyObject_GetItem(mapping, w);
3470 Py_DECREF(w);
3471 if (x == NULL) {
3472 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3473 /* No mapping found means: use 1:1 mapping. */
3474 PyErr_Clear();
3475 *result = NULL;
3476 return 0;
3477 } else
3478 return -1;
3479 }
3480 else if (x == Py_None) {
3481 *result = x;
3482 return 0;
3483 }
3484 else if (PyInt_Check(x)) {
3485 long value = PyInt_AS_LONG(x);
3486 long max = PyUnicode_GetMax();
3487 if (value < 0 || value > max) {
3488 PyErr_Format(PyExc_TypeError,
3489 "character mapping must be in range(0x%lx)", max+1);
3490 Py_DECREF(x);
3491 return -1;
3492 }
3493 *result = x;
3494 return 0;
3495 }
3496 else if (PyUnicode_Check(x)) {
3497 *result = x;
3498 return 0;
3499 }
3500 else {
3501 /* wrong return value */
3502 PyErr_SetString(PyExc_TypeError,
3503 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003504 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003505 return -1;
3506 }
3507}
3508/* ensure that *outobj is at least requiredsize characters long,
3509if not reallocate and adjust various state variables.
3510Return 0 on success, -1 on error */
3511static
Walter Dörwald4894c302003-10-24 14:25:28 +00003512int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003513 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003514{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003515 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003516 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003517 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003518 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003519 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003520 if (requiredsize < 2 * oldsize)
3521 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003522 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003523 return -1;
3524 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003525 }
3526 return 0;
3527}
3528/* lookup the character, put the result in the output string and adjust
3529 various state variables. Return a new reference to the object that
3530 was put in the output buffer in *result, or Py_None, if the mapping was
3531 undefined (in which case no character was written).
3532 The called must decref result.
3533 Return 0 on success, -1 on error. */
3534static
Walter Dörwald4894c302003-10-24 14:25:28 +00003535int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003536 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003537 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003538{
Walter Dörwald4894c302003-10-24 14:25:28 +00003539 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003540 return -1;
3541 if (*res==NULL) {
3542 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003543 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003544 }
3545 else if (*res==Py_None)
3546 ;
3547 else if (PyInt_Check(*res)) {
3548 /* no overflow check, because we know that the space is enough */
3549 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3550 }
3551 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003552 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003553 if (repsize==1) {
3554 /* no overflow check, because we know that the space is enough */
3555 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3556 }
3557 else if (repsize!=0) {
3558 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003559 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003560 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003561 repsize - 1;
3562 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003563 return -1;
3564 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3565 *outp += repsize;
3566 }
3567 }
3568 else
3569 return -1;
3570 return 0;
3571}
3572
3573PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003574 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003575 PyObject *mapping,
3576 const char *errors)
3577{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003578 /* output object */
3579 PyObject *res = NULL;
3580 /* pointers to the beginning and end+1 of input */
3581 const Py_UNICODE *startp = p;
3582 const Py_UNICODE *endp = p + size;
3583 /* pointer into the output */
3584 Py_UNICODE *str;
3585 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003586 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003587 char *reason = "character maps to <undefined>";
3588 PyObject *errorHandler = NULL;
3589 PyObject *exc = NULL;
3590 /* the following variable is used for caching string comparisons
3591 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3592 * 3=ignore, 4=xmlcharrefreplace */
3593 int known_errorHandler = -1;
3594
Guido van Rossumd57fd912000-03-10 22:53:23 +00003595 if (mapping == NULL) {
3596 PyErr_BadArgument();
3597 return NULL;
3598 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003599
3600 /* allocate enough for a simple 1:1 translation without
3601 replacements, if we need more, we'll resize */
3602 res = PyUnicode_FromUnicode(NULL, size);
3603 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003604 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003605 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003606 return res;
3607 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003608
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003609 while (p<endp) {
3610 /* try to encode it */
3611 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003612 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003613 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003614 goto onError;
3615 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003616 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003617 if (x!=Py_None) /* it worked => adjust input pointer */
3618 ++p;
3619 else { /* untranslatable character */
3620 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003621 Py_ssize_t repsize;
3622 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003623 Py_UNICODE *uni2;
3624 /* startpos for collecting untranslatable chars */
3625 const Py_UNICODE *collstart = p;
3626 const Py_UNICODE *collend = p+1;
3627 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003629 /* find all untranslatable characters */
3630 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003631 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003632 goto onError;
3633 Py_XDECREF(x);
3634 if (x!=Py_None)
3635 break;
3636 ++collend;
3637 }
3638 /* cache callback name lookup
3639 * (if not done yet, i.e. it's the first error) */
3640 if (known_errorHandler==-1) {
3641 if ((errors==NULL) || (!strcmp(errors, "strict")))
3642 known_errorHandler = 1;
3643 else if (!strcmp(errors, "replace"))
3644 known_errorHandler = 2;
3645 else if (!strcmp(errors, "ignore"))
3646 known_errorHandler = 3;
3647 else if (!strcmp(errors, "xmlcharrefreplace"))
3648 known_errorHandler = 4;
3649 else
3650 known_errorHandler = 0;
3651 }
3652 switch (known_errorHandler) {
3653 case 1: /* strict */
3654 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3655 goto onError;
3656 case 2: /* replace */
3657 /* No need to check for space, this is a 1:1 replacement */
3658 for (coll = collstart; coll<collend; ++coll)
3659 *str++ = '?';
3660 /* fall through */
3661 case 3: /* ignore */
3662 p = collend;
3663 break;
3664 case 4: /* xmlcharrefreplace */
3665 /* generate replacement (temporarily (mis)uses p) */
3666 for (p = collstart; p < collend; ++p) {
3667 char buffer[2+29+1+1];
3668 char *cp;
3669 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003670 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003671 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3672 goto onError;
3673 for (cp = buffer; *cp; ++cp)
3674 *str++ = *cp;
3675 }
3676 p = collend;
3677 break;
3678 default:
3679 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3680 reason, startp, size, &exc,
3681 collstart-startp, collend-startp, &newpos);
3682 if (repunicode == NULL)
3683 goto onError;
3684 /* generate replacement */
3685 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003686 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003687 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3688 Py_DECREF(repunicode);
3689 goto onError;
3690 }
3691 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3692 *str++ = *uni2;
3693 p = startp + newpos;
3694 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003695 }
3696 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003697 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003698 /* Resize if we allocated to much */
3699 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003700 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003701 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003702 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003703 }
3704 Py_XDECREF(exc);
3705 Py_XDECREF(errorHandler);
3706 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003707
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003708 onError:
3709 Py_XDECREF(res);
3710 Py_XDECREF(exc);
3711 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003712 return NULL;
3713}
3714
3715PyObject *PyUnicode_Translate(PyObject *str,
3716 PyObject *mapping,
3717 const char *errors)
3718{
3719 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003720
Guido van Rossumd57fd912000-03-10 22:53:23 +00003721 str = PyUnicode_FromObject(str);
3722 if (str == NULL)
3723 goto onError;
3724 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3725 PyUnicode_GET_SIZE(str),
3726 mapping,
3727 errors);
3728 Py_DECREF(str);
3729 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003730
Guido van Rossumd57fd912000-03-10 22:53:23 +00003731 onError:
3732 Py_XDECREF(str);
3733 return NULL;
3734}
Tim Petersced69f82003-09-16 20:30:58 +00003735
Guido van Rossum9e896b32000-04-05 20:11:21 +00003736/* --- Decimal Encoder ---------------------------------------------------- */
3737
3738int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003739 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00003740 char *output,
3741 const char *errors)
3742{
3743 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003744 PyObject *errorHandler = NULL;
3745 PyObject *exc = NULL;
3746 const char *encoding = "decimal";
3747 const char *reason = "invalid decimal Unicode string";
3748 /* the following variable is used for caching string comparisons
3749 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3750 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003751
3752 if (output == NULL) {
3753 PyErr_BadArgument();
3754 return -1;
3755 }
3756
3757 p = s;
3758 end = s + length;
3759 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003760 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003761 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003762 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003763 Py_ssize_t repsize;
3764 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003765 Py_UNICODE *uni2;
3766 Py_UNICODE *collstart;
3767 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003768
Guido van Rossum9e896b32000-04-05 20:11:21 +00003769 if (Py_UNICODE_ISSPACE(ch)) {
3770 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003771 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003772 continue;
3773 }
3774 decimal = Py_UNICODE_TODECIMAL(ch);
3775 if (decimal >= 0) {
3776 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003777 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003778 continue;
3779 }
Guido van Rossumba477042000-04-06 18:18:10 +00003780 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003781 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003782 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003783 continue;
3784 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003785 /* All other characters are considered unencodable */
3786 collstart = p;
3787 collend = p+1;
3788 while (collend < end) {
3789 if ((0 < *collend && *collend < 256) ||
3790 !Py_UNICODE_ISSPACE(*collend) ||
3791 Py_UNICODE_TODECIMAL(*collend))
3792 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003793 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003794 /* cache callback name lookup
3795 * (if not done yet, i.e. it's the first error) */
3796 if (known_errorHandler==-1) {
3797 if ((errors==NULL) || (!strcmp(errors, "strict")))
3798 known_errorHandler = 1;
3799 else if (!strcmp(errors, "replace"))
3800 known_errorHandler = 2;
3801 else if (!strcmp(errors, "ignore"))
3802 known_errorHandler = 3;
3803 else if (!strcmp(errors, "xmlcharrefreplace"))
3804 known_errorHandler = 4;
3805 else
3806 known_errorHandler = 0;
3807 }
3808 switch (known_errorHandler) {
3809 case 1: /* strict */
3810 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3811 goto onError;
3812 case 2: /* replace */
3813 for (p = collstart; p < collend; ++p)
3814 *output++ = '?';
3815 /* fall through */
3816 case 3: /* ignore */
3817 p = collend;
3818 break;
3819 case 4: /* xmlcharrefreplace */
3820 /* generate replacement (temporarily (mis)uses p) */
3821 for (p = collstart; p < collend; ++p)
3822 output += sprintf(output, "&#%d;", (int)*p);
3823 p = collend;
3824 break;
3825 default:
3826 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3827 encoding, reason, s, length, &exc,
3828 collstart-s, collend-s, &newpos);
3829 if (repunicode == NULL)
3830 goto onError;
3831 /* generate replacement */
3832 repsize = PyUnicode_GET_SIZE(repunicode);
3833 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3834 Py_UNICODE ch = *uni2;
3835 if (Py_UNICODE_ISSPACE(ch))
3836 *output++ = ' ';
3837 else {
3838 decimal = Py_UNICODE_TODECIMAL(ch);
3839 if (decimal >= 0)
3840 *output++ = '0' + decimal;
3841 else if (0 < ch && ch < 256)
3842 *output++ = (char)ch;
3843 else {
3844 Py_DECREF(repunicode);
3845 raise_encode_exception(&exc, encoding,
3846 s, length, collstart-s, collend-s, reason);
3847 goto onError;
3848 }
3849 }
3850 }
3851 p = s + newpos;
3852 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003853 }
3854 }
3855 /* 0-terminate the output string */
3856 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003857 Py_XDECREF(exc);
3858 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003859 return 0;
3860
3861 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003862 Py_XDECREF(exc);
3863 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003864 return -1;
3865}
3866
Guido van Rossumd57fd912000-03-10 22:53:23 +00003867/* --- Helpers ------------------------------------------------------------ */
3868
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003869/* fast search/count implementation, based on a mix between boyer-
3870 moore and horspool, with a few more bells and whistles on the top.
3871 for some more background, see: http://effbot.org/stringlib */
3872
/* note: fastsearch may access s[n], which isn't a problem when using
   Python's ordinary string types, but may cause problems if you're
   using this code in other contexts.  also, the count mode returns -1
   if there cannot possibly be a match in the target string, and 0 if
   it has actually checked for matches, but didn't find any.  callers
   beware! */
Fredrik Lundhd5e0dc52006-05-24 15:11:01 +00003879
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003880#define FAST_COUNT 0
3881#define FAST_SEARCH 1
3882
Fredrik Lundhd5e0dc52006-05-24 15:11:01 +00003883LOCAL(Py_ssize_t)
3884fastsearch(Py_UNICODE* s, Py_ssize_t n, Py_UNICODE* p, Py_ssize_t m, int mode)
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003885{
3886 long mask;
3887 int skip, count = 0;
3888 Py_ssize_t i, j, mlast, w;
3889
3890 w = n - m;
3891
3892 if (w < 0)
3893 return -1;
3894
3895 /* look for special cases */
3896 if (m <= 1) {
Fredrik Lundhd5e0dc52006-05-24 15:11:01 +00003897 if (m <= 0)
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003898 return -1;
3899 /* use special case for 1-character strings */
3900 if (mode == FAST_COUNT) {
3901 for (i = 0; i < n; i++)
3902 if (s[i] == p[0])
3903 count++;
3904 return count;
3905 } else {
3906 for (i = 0; i < n; i++)
3907 if (s[i] == p[0])
3908 return i;
3909 }
3910 return -1;
3911 }
3912
3913 mlast = m - 1;
3914
3915 /* create compressed boyer-moore delta 1 table */
3916 skip = mlast - 1;
3917 /* process pattern[:-1] */
3918 for (mask = i = 0; i < mlast; i++) {
3919 mask |= (1 << (p[i] & 0x1F));
3920 if (p[i] == p[mlast])
3921 skip = mlast - i - 1;
3922 }
3923 /* process pattern[-1] outside the loop */
3924 mask |= (1 << (p[mlast] & 0x1F));
3925
3926 for (i = 0; i <= w; i++) {
3927 /* note: using mlast in the skip path slows things down on x86 */
3928 if (s[i+m-1] == p[m-1]) {
3929 /* candidate match */
3930 for (j = 0; j < mlast; j++)
3931 if (s[i+j] != p[j])
3932 break;
3933 if (j == mlast) {
3934 /* got a match! */
3935 if (mode != FAST_COUNT)
3936 return i;
3937 count++;
3938 i = i + mlast;
3939 continue;
3940 }
3941 /* miss: check if next character is part of pattern */
3942 if (!(mask & (1 << (s[i+m] & 0x1F))))
3943 i = i + m;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00003944 else
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003945 i = i + skip;
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003946 } else {
3947 /* skip: check if next character is part of pattern */
3948 if (!(mask & (1 << (s[i+m] & 0x1F))))
3949 i = i + m;
3950 }
3951 }
3952
3953 if (mode != FAST_COUNT)
3954 return -1;
3955 return count;
3956}
3957
3958LOCAL(Py_ssize_t) count(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003959 Py_ssize_t start,
3960 Py_ssize_t end,
3961 PyUnicodeObject *substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003962{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003963 Py_ssize_t count = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003964
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003965 if (start < 0)
3966 start += self->length;
3967 if (start < 0)
3968 start = 0;
3969 if (end > self->length)
3970 end = self->length;
3971 if (end < 0)
3972 end += self->length;
3973 if (end < 0)
3974 end = 0;
3975
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003976 if (substring->length == 0)
3977 return (end - start + 1);
3978
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003979 count = fastsearch(
3980 PyUnicode_AS_UNICODE(self) + start, end - start,
3981 substring->str, substring->length, FAST_COUNT
3982 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00003983
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003984 if (count < 0)
3985 count = 0; /* no match */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003986
3987 return count;
3988}
3989
Martin v. Löwis18e16552006-02-15 17:27:45 +00003990Py_ssize_t PyUnicode_Count(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003991 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003992 Py_ssize_t start,
3993 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003994{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003995 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00003996
Guido van Rossumd57fd912000-03-10 22:53:23 +00003997 str = PyUnicode_FromObject(str);
3998 if (str == NULL)
3999 return -1;
4000 substr = PyUnicode_FromObject(substr);
4001 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00004002 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004003 return -1;
4004 }
Tim Petersced69f82003-09-16 20:30:58 +00004005
Guido van Rossumd57fd912000-03-10 22:53:23 +00004006 result = count((PyUnicodeObject *)str,
4007 start, end,
4008 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00004009
Guido van Rossumd57fd912000-03-10 22:53:23 +00004010 Py_DECREF(str);
4011 Py_DECREF(substr);
4012 return result;
4013}
4014
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004015static Py_ssize_t findstring(PyUnicodeObject *self,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004016 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004017 Py_ssize_t start,
4018 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004019 int direction)
4020{
4021 if (start < 0)
4022 start += self->length;
4023 if (start < 0)
4024 start = 0;
4025
Guido van Rossumd57fd912000-03-10 22:53:23 +00004026 if (end > self->length)
4027 end = self->length;
4028 if (end < 0)
4029 end += self->length;
4030 if (end < 0)
4031 end = 0;
4032
Guido van Rossum76afbd92002-08-20 17:29:29 +00004033 if (substring->length == 0)
4034 return (direction > 0) ? start : end;
4035
Fredrik Lundh6471ee42006-05-24 14:28:11 +00004036 if (direction > 0) {
4037 Py_ssize_t pos = fastsearch(
4038 PyUnicode_AS_UNICODE(self) + start, end - start,
4039 substring->str, substring->length, FAST_SEARCH
4040 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00004041 if (pos >= 0)
4042 return pos + start;
4043 } else {
4044 end -= substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004045 for (; end >= start; end--)
4046 if (Py_UNICODE_MATCH(self, end, substring))
4047 return end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004048 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004049 return -1;
4050}
4051
Martin v. Löwis18e16552006-02-15 17:27:45 +00004052Py_ssize_t PyUnicode_Find(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004053 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004054 Py_ssize_t start,
4055 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004056 int direction)
4057{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004058 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004059
Guido van Rossumd57fd912000-03-10 22:53:23 +00004060 str = PyUnicode_FromObject(str);
4061 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004062 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004063 substr = PyUnicode_FromObject(substr);
4064 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004065 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004066 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004067 }
Tim Petersced69f82003-09-16 20:30:58 +00004068
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069 result = findstring((PyUnicodeObject *)str,
4070 (PyUnicodeObject *)substr,
4071 start, end, direction);
4072 Py_DECREF(str);
4073 Py_DECREF(substr);
4074 return result;
4075}
4076
Tim Petersced69f82003-09-16 20:30:58 +00004077static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004078int tailmatch(PyUnicodeObject *self,
4079 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004080 Py_ssize_t start,
4081 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004082 int direction)
4083{
4084 if (start < 0)
4085 start += self->length;
4086 if (start < 0)
4087 start = 0;
4088
4089 if (substring->length == 0)
4090 return 1;
4091
4092 if (end > self->length)
4093 end = self->length;
4094 if (end < 0)
4095 end += self->length;
4096 if (end < 0)
4097 end = 0;
4098
4099 end -= substring->length;
4100 if (end < start)
4101 return 0;
4102
4103 if (direction > 0) {
4104 if (Py_UNICODE_MATCH(self, end, substring))
4105 return 1;
4106 } else {
4107 if (Py_UNICODE_MATCH(self, start, substring))
4108 return 1;
4109 }
4110
4111 return 0;
4112}
4113
Martin v. Löwis18e16552006-02-15 17:27:45 +00004114Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004116 Py_ssize_t start,
4117 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004118 int direction)
4119{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004120 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004121
Guido van Rossumd57fd912000-03-10 22:53:23 +00004122 str = PyUnicode_FromObject(str);
4123 if (str == NULL)
4124 return -1;
4125 substr = PyUnicode_FromObject(substr);
4126 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004127 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128 return -1;
4129 }
Tim Petersced69f82003-09-16 20:30:58 +00004130
Guido van Rossumd57fd912000-03-10 22:53:23 +00004131 result = tailmatch((PyUnicodeObject *)str,
4132 (PyUnicodeObject *)substr,
4133 start, end, direction);
4134 Py_DECREF(str);
4135 Py_DECREF(substr);
4136 return result;
4137}
4138
Guido van Rossumd57fd912000-03-10 22:53:23 +00004139/* Apply fixfct filter to the Unicode object self and return a
4140 reference to the modified object */
4141
Tim Petersced69f82003-09-16 20:30:58 +00004142static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004143PyObject *fixup(PyUnicodeObject *self,
4144 int (*fixfct)(PyUnicodeObject *s))
4145{
4146
4147 PyUnicodeObject *u;
4148
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004149 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004150 if (u == NULL)
4151 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004152
4153 Py_UNICODE_COPY(u->str, self->str, self->length);
4154
Tim Peters7a29bd52001-09-12 03:03:31 +00004155 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004156 /* fixfct should return TRUE if it modified the buffer. If
4157 FALSE, return a reference to the original buffer instead
4158 (to save space, not time) */
4159 Py_INCREF(self);
4160 Py_DECREF(u);
4161 return (PyObject*) self;
4162 }
4163 return (PyObject*) u;
4164}
4165
Tim Petersced69f82003-09-16 20:30:58 +00004166static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004167int fixupper(PyUnicodeObject *self)
4168{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004169 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004170 Py_UNICODE *s = self->str;
4171 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004172
Guido van Rossumd57fd912000-03-10 22:53:23 +00004173 while (len-- > 0) {
4174 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004175
Guido van Rossumd57fd912000-03-10 22:53:23 +00004176 ch = Py_UNICODE_TOUPPER(*s);
4177 if (ch != *s) {
4178 status = 1;
4179 *s = ch;
4180 }
4181 s++;
4182 }
4183
4184 return status;
4185}
4186
Tim Petersced69f82003-09-16 20:30:58 +00004187static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004188int fixlower(PyUnicodeObject *self)
4189{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004190 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004191 Py_UNICODE *s = self->str;
4192 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004193
Guido van Rossumd57fd912000-03-10 22:53:23 +00004194 while (len-- > 0) {
4195 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004196
Guido van Rossumd57fd912000-03-10 22:53:23 +00004197 ch = Py_UNICODE_TOLOWER(*s);
4198 if (ch != *s) {
4199 status = 1;
4200 *s = ch;
4201 }
4202 s++;
4203 }
4204
4205 return status;
4206}
4207
Tim Petersced69f82003-09-16 20:30:58 +00004208static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004209int fixswapcase(PyUnicodeObject *self)
4210{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004211 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004212 Py_UNICODE *s = self->str;
4213 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004214
Guido van Rossumd57fd912000-03-10 22:53:23 +00004215 while (len-- > 0) {
4216 if (Py_UNICODE_ISUPPER(*s)) {
4217 *s = Py_UNICODE_TOLOWER(*s);
4218 status = 1;
4219 } else if (Py_UNICODE_ISLOWER(*s)) {
4220 *s = Py_UNICODE_TOUPPER(*s);
4221 status = 1;
4222 }
4223 s++;
4224 }
4225
4226 return status;
4227}
4228
Tim Petersced69f82003-09-16 20:30:58 +00004229static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004230int fixcapitalize(PyUnicodeObject *self)
4231{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004232 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004233 Py_UNICODE *s = self->str;
4234 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004235
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004236 if (len == 0)
4237 return 0;
4238 if (Py_UNICODE_ISLOWER(*s)) {
4239 *s = Py_UNICODE_TOUPPER(*s);
4240 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004241 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004242 s++;
4243 while (--len > 0) {
4244 if (Py_UNICODE_ISUPPER(*s)) {
4245 *s = Py_UNICODE_TOLOWER(*s);
4246 status = 1;
4247 }
4248 s++;
4249 }
4250 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004251}
4252
4253static
4254int fixtitle(PyUnicodeObject *self)
4255{
4256 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4257 register Py_UNICODE *e;
4258 int previous_is_cased;
4259
4260 /* Shortcut for single character strings */
4261 if (PyUnicode_GET_SIZE(self) == 1) {
4262 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4263 if (*p != ch) {
4264 *p = ch;
4265 return 1;
4266 }
4267 else
4268 return 0;
4269 }
Tim Petersced69f82003-09-16 20:30:58 +00004270
Guido van Rossumd57fd912000-03-10 22:53:23 +00004271 e = p + PyUnicode_GET_SIZE(self);
4272 previous_is_cased = 0;
4273 for (; p < e; p++) {
4274 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004275
Guido van Rossumd57fd912000-03-10 22:53:23 +00004276 if (previous_is_cased)
4277 *p = Py_UNICODE_TOLOWER(ch);
4278 else
4279 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004280
4281 if (Py_UNICODE_ISLOWER(ch) ||
4282 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004283 Py_UNICODE_ISTITLE(ch))
4284 previous_is_cased = 1;
4285 else
4286 previous_is_cased = 0;
4287 }
4288 return 1;
4289}
4290
Tim Peters8ce9f162004-08-27 01:49:32 +00004291PyObject *
4292PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004293{
Tim Peters8ce9f162004-08-27 01:49:32 +00004294 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004295 const Py_UNICODE blank = ' ';
4296 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00004297 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004298 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00004299 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4300 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004301 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4302 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004303 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004304 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004305 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004306
Tim Peters05eba1f2004-08-27 21:32:02 +00004307 fseq = PySequence_Fast(seq, "");
4308 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004309 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004310 }
4311
Tim Peters91879ab2004-08-27 22:35:44 +00004312 /* Grrrr. A codec may be invoked to convert str objects to
4313 * Unicode, and so it's possible to call back into Python code
4314 * during PyUnicode_FromObject(), and so it's possible for a sick
4315 * codec to change the size of fseq (if seq is a list). Therefore
4316 * we have to keep refetching the size -- can't assume seqlen
4317 * is invariant.
4318 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004319 seqlen = PySequence_Fast_GET_SIZE(fseq);
4320 /* If empty sequence, return u"". */
4321 if (seqlen == 0) {
4322 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4323 goto Done;
4324 }
4325 /* If singleton sequence with an exact Unicode, return that. */
4326 if (seqlen == 1) {
4327 item = PySequence_Fast_GET_ITEM(fseq, 0);
4328 if (PyUnicode_CheckExact(item)) {
4329 Py_INCREF(item);
4330 res = (PyUnicodeObject *)item;
4331 goto Done;
4332 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004333 }
4334
Tim Peters05eba1f2004-08-27 21:32:02 +00004335 /* At least two items to join, or one that isn't exact Unicode. */
4336 if (seqlen > 1) {
4337 /* Set up sep and seplen -- they're needed. */
4338 if (separator == NULL) {
4339 sep = &blank;
4340 seplen = 1;
4341 }
4342 else {
4343 internal_separator = PyUnicode_FromObject(separator);
4344 if (internal_separator == NULL)
4345 goto onError;
4346 sep = PyUnicode_AS_UNICODE(internal_separator);
4347 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004348 /* In case PyUnicode_FromObject() mutated seq. */
4349 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004350 }
4351 }
4352
4353 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004354 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004355 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004356 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004357 res_p = PyUnicode_AS_UNICODE(res);
4358 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004359
Tim Peters05eba1f2004-08-27 21:32:02 +00004360 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00004361 Py_ssize_t itemlen;
4362 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004363
4364 item = PySequence_Fast_GET_ITEM(fseq, i);
4365 /* Convert item to Unicode. */
4366 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4367 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00004368 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004369 " %.80s found",
4370 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004371 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004372 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004373 item = PyUnicode_FromObject(item);
4374 if (item == NULL)
4375 goto onError;
4376 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004377
Tim Peters91879ab2004-08-27 22:35:44 +00004378 /* In case PyUnicode_FromObject() mutated seq. */
4379 seqlen = PySequence_Fast_GET_SIZE(fseq);
4380
Tim Peters8ce9f162004-08-27 01:49:32 +00004381 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004382 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004383 new_res_used = res_used + itemlen;
Tim Peters286085c2006-05-22 19:17:04 +00004384 if (new_res_used <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004385 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004386 if (i < seqlen - 1) {
4387 new_res_used += seplen;
Tim Peters286085c2006-05-22 19:17:04 +00004388 if (new_res_used <= 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004389 goto Overflow;
4390 }
4391 if (new_res_used > res_alloc) {
4392 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004393 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004394 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00004395 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004396 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004397 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00004398 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004399 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004400 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004401 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004402 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004403 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004404
4405 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004406 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004407 res_p += itemlen;
4408 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004409 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004410 res_p += seplen;
4411 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004412 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004413 res_used = new_res_used;
4414 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004415
Tim Peters05eba1f2004-08-27 21:32:02 +00004416 /* Shrink res to match the used area; this probably can't fail,
4417 * but it's cheap to check.
4418 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004419 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004420 goto onError;
4421
4422 Done:
4423 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004424 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004425 return (PyObject *)res;
4426
Tim Peters8ce9f162004-08-27 01:49:32 +00004427 Overflow:
4428 PyErr_SetString(PyExc_OverflowError,
4429 "join() is too long for a Python string");
4430 Py_DECREF(item);
4431 /* fall through */
4432
Guido van Rossumd57fd912000-03-10 22:53:23 +00004433 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004434 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004435 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004436 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004437 return NULL;
4438}
4439
Tim Petersced69f82003-09-16 20:30:58 +00004440static
4441PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004442 Py_ssize_t left,
4443 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004444 Py_UNICODE fill)
4445{
4446 PyUnicodeObject *u;
4447
4448 if (left < 0)
4449 left = 0;
4450 if (right < 0)
4451 right = 0;
4452
Tim Peters7a29bd52001-09-12 03:03:31 +00004453 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004454 Py_INCREF(self);
4455 return self;
4456 }
4457
4458 u = _PyUnicode_New(left + self->length + right);
4459 if (u) {
4460 if (left)
4461 Py_UNICODE_FILL(u->str, fill, left);
4462 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4463 if (right)
4464 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4465 }
4466
4467 return u;
4468}
4469
/* Append the slice data[left:right] to `list` as a new Unicode object.
   Relies on the expanding function to provide a `PyObject *str`
   scratch variable, a `list` object and an `onError` label, which is
   jumped to on any failure.  Wrapped in do { } while (0) so that the
   macro behaves as one statement; use it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        /* the list holds its own reference now */                      \
        Py_DECREF(str);                                                 \
    } while (0)
4480
4481static
4482PyObject *split_whitespace(PyUnicodeObject *self,
4483 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004484 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004485{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004486 register Py_ssize_t i;
4487 register Py_ssize_t j;
4488 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489 PyObject *str;
4490
4491 for (i = j = 0; i < len; ) {
4492 /* find a token */
4493 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4494 i++;
4495 j = i;
4496 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4497 i++;
4498 if (j < i) {
4499 if (maxcount-- <= 0)
4500 break;
4501 SPLIT_APPEND(self->str, j, i);
4502 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4503 i++;
4504 j = i;
4505 }
4506 }
4507 if (j < len) {
4508 SPLIT_APPEND(self->str, j, len);
4509 }
4510 return list;
4511
4512 onError:
4513 Py_DECREF(list);
4514 return NULL;
4515}
4516
4517PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004518 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004519{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004520 register Py_ssize_t i;
4521 register Py_ssize_t j;
4522 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004523 PyObject *list;
4524 PyObject *str;
4525 Py_UNICODE *data;
4526
4527 string = PyUnicode_FromObject(string);
4528 if (string == NULL)
4529 return NULL;
4530 data = PyUnicode_AS_UNICODE(string);
4531 len = PyUnicode_GET_SIZE(string);
4532
Guido van Rossumd57fd912000-03-10 22:53:23 +00004533 list = PyList_New(0);
4534 if (!list)
4535 goto onError;
4536
4537 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004538 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004539
Guido van Rossumd57fd912000-03-10 22:53:23 +00004540 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004541 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004542 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004543
4544 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004545 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004546 if (i < len) {
4547 if (data[i] == '\r' && i + 1 < len &&
4548 data[i+1] == '\n')
4549 i += 2;
4550 else
4551 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004552 if (keepends)
4553 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004554 }
Guido van Rossum86662912000-04-11 15:38:46 +00004555 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004556 j = i;
4557 }
4558 if (j < len) {
4559 SPLIT_APPEND(data, j, len);
4560 }
4561
4562 Py_DECREF(string);
4563 return list;
4564
4565 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004566 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004567 Py_DECREF(string);
4568 return NULL;
4569}
4570
Tim Petersced69f82003-09-16 20:30:58 +00004571static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004572PyObject *split_char(PyUnicodeObject *self,
4573 PyObject *list,
4574 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004575 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004576{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004577 register Py_ssize_t i;
4578 register Py_ssize_t j;
4579 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004580 PyObject *str;
4581
4582 for (i = j = 0; i < len; ) {
4583 if (self->str[i] == ch) {
4584 if (maxcount-- <= 0)
4585 break;
4586 SPLIT_APPEND(self->str, j, i);
4587 i = j = i + 1;
4588 } else
4589 i++;
4590 }
4591 if (j <= len) {
4592 SPLIT_APPEND(self->str, j, len);
4593 }
4594 return list;
4595
4596 onError:
4597 Py_DECREF(list);
4598 return NULL;
4599}
4600
Tim Petersced69f82003-09-16 20:30:58 +00004601static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004602PyObject *split_substring(PyUnicodeObject *self,
4603 PyObject *list,
4604 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004605 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004606{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004607 register Py_ssize_t i;
4608 register Py_ssize_t j;
4609 Py_ssize_t len = self->length;
4610 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004611 PyObject *str;
4612
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004613 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004614 if (Py_UNICODE_MATCH(self, i, substring)) {
4615 if (maxcount-- <= 0)
4616 break;
4617 SPLIT_APPEND(self->str, j, i);
4618 i = j = i + sublen;
4619 } else
4620 i++;
4621 }
4622 if (j <= len) {
4623 SPLIT_APPEND(self->str, j, len);
4624 }
4625 return list;
4626
4627 onError:
4628 Py_DECREF(list);
4629 return NULL;
4630}
4631
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004632static
4633PyObject *rsplit_whitespace(PyUnicodeObject *self,
4634 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004635 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004636{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004637 register Py_ssize_t i;
4638 register Py_ssize_t j;
4639 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004640 PyObject *str;
4641
4642 for (i = j = len - 1; i >= 0; ) {
4643 /* find a token */
4644 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4645 i--;
4646 j = i;
4647 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4648 i--;
4649 if (j > i) {
4650 if (maxcount-- <= 0)
4651 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004652 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004653 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4654 i--;
4655 j = i;
4656 }
4657 }
4658 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004659 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004660 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004661 if (PyList_Reverse(list) < 0)
4662 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004663 return list;
4664
4665 onError:
4666 Py_DECREF(list);
4667 return NULL;
4668}
4669
4670static
4671PyObject *rsplit_char(PyUnicodeObject *self,
4672 PyObject *list,
4673 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004674 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004675{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004676 register Py_ssize_t i;
4677 register Py_ssize_t j;
4678 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004679 PyObject *str;
4680
4681 for (i = j = len - 1; i >= 0; ) {
4682 if (self->str[i] == ch) {
4683 if (maxcount-- <= 0)
4684 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004685 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004686 j = i = i - 1;
4687 } else
4688 i--;
4689 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004690 if (j >= -1) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004691 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004692 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004693 if (PyList_Reverse(list) < 0)
4694 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004695 return list;
4696
4697 onError:
4698 Py_DECREF(list);
4699 return NULL;
4700}
4701
4702static
4703PyObject *rsplit_substring(PyUnicodeObject *self,
4704 PyObject *list,
4705 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004706 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004707{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004708 register Py_ssize_t i;
4709 register Py_ssize_t j;
4710 Py_ssize_t len = self->length;
4711 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004712 PyObject *str;
4713
4714 for (i = len - sublen, j = len; i >= 0; ) {
4715 if (Py_UNICODE_MATCH(self, i, substring)) {
4716 if (maxcount-- <= 0)
4717 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004718 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004719 j = i;
4720 i -= sublen;
4721 } else
4722 i--;
4723 }
4724 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004725 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004726 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004727 if (PyList_Reverse(list) < 0)
4728 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004729 return list;
4730
4731 onError:
4732 Py_DECREF(list);
4733 return NULL;
4734}
4735
Guido van Rossumd57fd912000-03-10 22:53:23 +00004736#undef SPLIT_APPEND
4737
4738static
4739PyObject *split(PyUnicodeObject *self,
4740 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004741 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742{
4743 PyObject *list;
4744
4745 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004746 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747
4748 list = PyList_New(0);
4749 if (!list)
4750 return NULL;
4751
4752 if (substring == NULL)
4753 return split_whitespace(self,list,maxcount);
4754
4755 else if (substring->length == 1)
4756 return split_char(self,list,substring->str[0],maxcount);
4757
4758 else if (substring->length == 0) {
4759 Py_DECREF(list);
4760 PyErr_SetString(PyExc_ValueError, "empty separator");
4761 return NULL;
4762 }
4763 else
4764 return split_substring(self,list,substring,maxcount);
4765}
4766
Tim Petersced69f82003-09-16 20:30:58 +00004767static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004768PyObject *rsplit(PyUnicodeObject *self,
4769 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004770 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004771{
4772 PyObject *list;
4773
4774 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004775 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004776
4777 list = PyList_New(0);
4778 if (!list)
4779 return NULL;
4780
4781 if (substring == NULL)
4782 return rsplit_whitespace(self,list,maxcount);
4783
4784 else if (substring->length == 1)
4785 return rsplit_char(self,list,substring->str[0],maxcount);
4786
4787 else if (substring->length == 0) {
4788 Py_DECREF(list);
4789 PyErr_SetString(PyExc_ValueError, "empty separator");
4790 return NULL;
4791 }
4792 else
4793 return rsplit_substring(self,list,substring,maxcount);
4794}
4795
4796static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797PyObject *replace(PyUnicodeObject *self,
4798 PyUnicodeObject *str1,
4799 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004800 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004801{
4802 PyUnicodeObject *u;
4803
4804 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004805 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004806
Fredrik Lundh347ee272006-05-24 16:35:18 +00004807 if (str1->length == str2->length) {
4808 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004809 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00004810 if (str1->length == 1) {
4811 /* replace characters */
4812 Py_UNICODE u1, u2;
4813 if (!findchar(self->str, self->length, str1->str[0]))
4814 goto nothing;
4815 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
4816 if (!u)
4817 return NULL;
4818 Py_UNICODE_COPY(u->str, self->str, self->length);
4819 u1 = str1->str[0];
4820 u2 = str2->str[0];
4821 for (i = 0; i < u->length; i++)
4822 if (u->str[i] == u1) {
4823 if (--maxcount < 0)
4824 break;
4825 u->str[i] = u2;
4826 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004827 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00004828 i = fastsearch(
4829 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00004830 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00004831 if (i < 0)
4832 goto nothing;
4833 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
4834 if (!u)
4835 return NULL;
4836 Py_UNICODE_COPY(u->str, self->str, self->length);
4837 while (i <= self->length - str1->length)
4838 if (Py_UNICODE_MATCH(self, i, str1)) {
4839 if (--maxcount < 0)
4840 break;
4841 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
4842 i += str1->length;
4843 } else
4844 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00004847
Martin v. Löwis18e16552006-02-15 17:27:45 +00004848 Py_ssize_t n, i;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00004849 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004850 Py_UNICODE *p;
4851
4852 /* replace strings */
4853 n = count(self, 0, self->length, str1);
4854 if (n > maxcount)
4855 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00004856 if (n == 0)
4857 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00004858 /* new_size = self->length + n * (str2->length - str1->length)); */
4859 delta = (str2->length - str1->length);
4860 if (delta == 0) {
4861 new_size = self->length;
4862 } else {
4863 product = n * (str2->length - str1->length);
4864 if ((product / (str2->length - str1->length)) != n) {
4865 PyErr_SetString(PyExc_OverflowError,
4866 "replace string is too long");
4867 return NULL;
4868 }
4869 new_size = self->length + product;
4870 if (new_size < 0) {
4871 PyErr_SetString(PyExc_OverflowError,
4872 "replace string is too long");
4873 return NULL;
4874 }
4875 }
4876 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00004877 if (!u)
4878 return NULL;
4879 i = 0;
4880 p = u->str;
4881 if (str1->length > 0) {
4882 while (i <= self->length - str1->length)
4883 if (Py_UNICODE_MATCH(self, i, str1)) {
4884 /* replace string segment */
4885 Py_UNICODE_COPY(p, str2->str, str2->length);
4886 p += str2->length;
4887 i += str1->length;
4888 if (--n <= 0) {
4889 /* copy remaining part */
4890 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4891 break;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004892 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00004893 } else
4894 *p++ = self->str[i++];
4895 } else {
4896 while (n > 0) {
4897 Py_UNICODE_COPY(p, str2->str, str2->length);
4898 p += str2->length;
4899 if (--n <= 0)
4900 break;
4901 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004902 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00004903 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004904 }
4905 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004906 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00004907
4908nothing:
4909 /* nothing to replace; return original string (when possible) */
4910 if (PyUnicode_CheckExact(self)) {
4911 Py_INCREF(self);
4912 return (PyObject *) self;
4913 }
4914 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004915}
4916
4917/* --- Unicode Object Methods --------------------------------------------- */
4918
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004919PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004920"S.title() -> unicode\n\
4921\n\
4922Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004923characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924
4925static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004926unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004927{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004928 return fixup(self, fixtitle);
4929}
4930
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004931PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004932"S.capitalize() -> unicode\n\
4933\n\
4934Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004935have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004936
4937static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004938unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004939{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004940 return fixup(self, fixcapitalize);
4941}
4942
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split self on whitespace,
   run each word through fixup() with the fixcapitalize callback, then
   join the words again with PyUnicode_Join(NULL, ...).  Returns NULL
   if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capped = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                                 fixcapitalize);
        if (capped == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
4980
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004981/* Argument converter. Coerces to a single unicode character */
4982
4983static int
4984convert_uc(PyObject *obj, void *addr)
4985{
4986 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4987 PyObject *uniobj;
4988 Py_UNICODE *unistr;
4989
4990 uniobj = PyUnicode_FromObject(obj);
4991 if (uniobj == NULL) {
4992 PyErr_SetString(PyExc_TypeError,
4993 "The fill character cannot be converted to Unicode");
4994 return 0;
4995 }
4996 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4997 PyErr_SetString(PyExc_TypeError,
4998 "The fill character must be exactly one character long");
4999 Py_DECREF(uniobj);
5000 return 0;
5001 }
5002 unistr = PyUnicode_AS_UNICODE(uniobj);
5003 *fillcharloc = unistr[0];
5004 Py_DECREF(uniobj);
5005 return 1;
5006}
5007
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005008PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005009"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005010\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005011Return S centered in a Unicode string of length width. Padding is\n\
5012done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005013
5014static PyObject *
5015unicode_center(PyUnicodeObject *self, PyObject *args)
5016{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005017 Py_ssize_t marg, left;
5018 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005019 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005020
Thomas Woutersde017742006-02-16 19:34:37 +00005021 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005022 return NULL;
5023
Tim Peters7a29bd52001-09-12 03:03:31 +00005024 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005025 Py_INCREF(self);
5026 return (PyObject*) self;
5027 }
5028
5029 marg = width - self->length;
5030 left = marg / 2 + (marg & width & 1);
5031
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005032 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005033}
5034
Marc-André Lemburge5034372000-08-08 08:04:29 +00005035#if 0
5036
5037/* This code should go into some future Unicode collation support
5038 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005039 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005040
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005041/* speedy UTF-16 code point order comparison */
5042/* gleaned from: */
5043/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5044
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005045static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005046{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005047 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005048 0, 0, 0, 0, 0, 0, 0, 0,
5049 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005050 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005051};
5052
Guido van Rossumd57fd912000-03-10 22:53:23 +00005053static int
5054unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5055{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005056 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005057
Guido van Rossumd57fd912000-03-10 22:53:23 +00005058 Py_UNICODE *s1 = str1->str;
5059 Py_UNICODE *s2 = str2->str;
5060
5061 len1 = str1->length;
5062 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005063
Guido van Rossumd57fd912000-03-10 22:53:23 +00005064 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005065 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005066
5067 c1 = *s1++;
5068 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005069
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005070 if (c1 > (1<<11) * 26)
5071 c1 += utf16Fixup[c1>>11];
5072 if (c2 > (1<<11) * 26)
5073 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005074 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005075
5076 if (c1 != c2)
5077 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005078
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005079 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005080 }
5081
5082 return (len1 < len2) ? -1 : (len1 != len2);
5083}
5084
Marc-André Lemburge5034372000-08-08 08:04:29 +00005085#else
5086
5087static int
5088unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5089{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005090 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005091
5092 Py_UNICODE *s1 = str1->str;
5093 Py_UNICODE *s2 = str2->str;
5094
5095 len1 = str1->length;
5096 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005097
Marc-André Lemburge5034372000-08-08 08:04:29 +00005098 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005099 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005100
Fredrik Lundh45714e92001-06-26 16:39:36 +00005101 c1 = *s1++;
5102 c2 = *s2++;
5103
5104 if (c1 != c2)
5105 return (c1 < c2) ? -1 : 1;
5106
Marc-André Lemburge5034372000-08-08 08:04:29 +00005107 len1--; len2--;
5108 }
5109
5110 return (len1 < len2) ? -1 : (len1 != len2);
5111}
5112
5113#endif
5114
Guido van Rossumd57fd912000-03-10 22:53:23 +00005115int PyUnicode_Compare(PyObject *left,
5116 PyObject *right)
5117{
5118 PyUnicodeObject *u = NULL, *v = NULL;
5119 int result;
5120
5121 /* Coerce the two arguments */
5122 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5123 if (u == NULL)
5124 goto onError;
5125 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5126 if (v == NULL)
5127 goto onError;
5128
Thomas Wouters7e474022000-07-16 12:04:32 +00005129 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005130 if (v == u) {
5131 Py_DECREF(u);
5132 Py_DECREF(v);
5133 return 0;
5134 }
5135
5136 result = unicode_compare(u, v);
5137
5138 Py_DECREF(u);
5139 Py_DECREF(v);
5140 return result;
5141
5142onError:
5143 Py_XDECREF(u);
5144 Py_XDECREF(v);
5145 return -1;
5146}
5147
Guido van Rossum403d68b2000-03-13 15:55:09 +00005148int PyUnicode_Contains(PyObject *container,
5149 PyObject *element)
5150{
Fredrik Lundh833bf942006-05-23 10:12:21 +00005151 PyUnicodeObject *u, *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005152 Py_ssize_t size;
Fredrik Lundhd5e0dc52006-05-24 15:11:01 +00005153 Py_ssize_t pos;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005154
5155 /* Coerce the two arguments */
Fredrik Lundh833bf942006-05-23 10:12:21 +00005156 v = (PyUnicodeObject *) PyUnicode_FromObject(element);
5157 if (!v) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005158 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005159 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00005160 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005161 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00005162
5163 u = (PyUnicodeObject *) PyUnicode_FromObject(container);
5164 if (!u) {
5165 Py_DECREF(v);
5166 return -1;
5167 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005168
Barry Warsaw817918c2002-08-06 16:58:21 +00005169 size = PyUnicode_GET_SIZE(v);
Fredrik Lundh833bf942006-05-23 10:12:21 +00005170 if (!size) {
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00005171 pos = 0;
Fredrik Lundh833bf942006-05-23 10:12:21 +00005172 goto done;
5173 }
Barry Warsaw817918c2002-08-06 16:58:21 +00005174
Fredrik Lundhd5e0dc52006-05-24 15:11:01 +00005175 pos = fastsearch(
5176 PyUnicode_AS_UNICODE(u), PyUnicode_GET_SIZE(u),
5177 PyUnicode_AS_UNICODE(v), size, FAST_SEARCH
5178 );
Guido van Rossum403d68b2000-03-13 15:55:09 +00005179
Fredrik Lundh833bf942006-05-23 10:12:21 +00005180done:
Guido van Rossum403d68b2000-03-13 15:55:09 +00005181 Py_DECREF(u);
5182 Py_DECREF(v);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00005183 return (pos != -1);
Guido van Rossum403d68b2000-03-13 15:55:09 +00005184}
5185
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186/* Concat to string or Unicode object giving a new Unicode object. */
5187
5188PyObject *PyUnicode_Concat(PyObject *left,
5189 PyObject *right)
5190{
5191 PyUnicodeObject *u = NULL, *v = NULL, *w;
5192
5193 /* Coerce the two arguments */
5194 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5195 if (u == NULL)
5196 goto onError;
5197 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5198 if (v == NULL)
5199 goto onError;
5200
5201 /* Shortcuts */
5202 if (v == unicode_empty) {
5203 Py_DECREF(v);
5204 return (PyObject *)u;
5205 }
5206 if (u == unicode_empty) {
5207 Py_DECREF(u);
5208 return (PyObject *)v;
5209 }
5210
5211 /* Concat the two Unicode strings */
5212 w = _PyUnicode_New(u->length + v->length);
5213 if (w == NULL)
5214 goto onError;
5215 Py_UNICODE_COPY(w->str, u->str, u->length);
5216 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5217
5218 Py_DECREF(u);
5219 Py_DECREF(v);
5220 return (PyObject *)w;
5221
5222onError:
5223 Py_XDECREF(u);
5224 Py_XDECREF(v);
5225 return NULL;
5226}
5227
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005228PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229"S.count(sub[, start[, end]]) -> int\n\
5230\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00005231Return the number of non-overlapping occurrences of substring sub in\n\
5232Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005233interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234
5235static PyObject *
5236unicode_count(PyUnicodeObject *self, PyObject *args)
5237{
5238 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005239 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005240 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241 PyObject *result;
5242
Guido van Rossumb8872e62000-05-09 14:14:27 +00005243 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5244 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005245 return NULL;
5246
5247 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5248 (PyObject *)substring);
5249 if (substring == NULL)
5250 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005251
Guido van Rossumd57fd912000-03-10 22:53:23 +00005252 if (start < 0)
5253 start += self->length;
5254 if (start < 0)
5255 start = 0;
5256 if (end > self->length)
5257 end = self->length;
5258 if (end < 0)
5259 end += self->length;
5260 if (end < 0)
5261 end = 0;
5262
Andrew Dalkeb552c4d2006-05-25 18:03:25 +00005263 result = PyInt_FromSsize_t(count(self, start, end, substring));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005264
5265 Py_DECREF(substring);
5266 return result;
5267}
5268
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005269PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005270"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005272Encodes S using the codec registered for encoding. encoding defaults\n\
5273to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005274handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005275a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5276'xmlcharrefreplace' as well as any other name registered with\n\
5277codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278
5279static PyObject *
5280unicode_encode(PyUnicodeObject *self, PyObject *args)
5281{
5282 char *encoding = NULL;
5283 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005284 PyObject *v;
5285
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5287 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005288 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005289 if (v == NULL)
5290 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005291 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5292 PyErr_Format(PyExc_TypeError,
5293 "encoder did not return a string/unicode object "
5294 "(type=%.400s)",
5295 v->ob_type->tp_name);
5296 Py_DECREF(v);
5297 return NULL;
5298 }
5299 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005300
5301 onError:
5302 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005303}
5304
5305PyDoc_STRVAR(decode__doc__,
5306"S.decode([encoding[,errors]]) -> string or unicode\n\
5307\n\
5308Decodes S using the codec registered for encoding. encoding defaults\n\
5309to the default encoding. errors may be given to set a different error\n\
5310handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5311a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5312as well as any other name registerd with codecs.register_error that is\n\
5313able to handle UnicodeDecodeErrors.");
5314
5315static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005316unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005317{
5318 char *encoding = NULL;
5319 char *errors = NULL;
5320 PyObject *v;
5321
5322 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5323 return NULL;
5324 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005325 if (v == NULL)
5326 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005327 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5328 PyErr_Format(PyExc_TypeError,
5329 "decoder did not return a string/unicode object "
5330 "(type=%.400s)",
5331 v->ob_type->tp_name);
5332 Py_DECREF(v);
5333 return NULL;
5334 }
5335 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005336
5337 onError:
5338 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005339}
5340
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005341PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342"S.expandtabs([tabsize]) -> unicode\n\
5343\n\
5344Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005345If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346
5347static PyObject*
5348unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5349{
5350 Py_UNICODE *e;
5351 Py_UNICODE *p;
5352 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005353 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354 PyUnicodeObject *u;
5355 int tabsize = 8;
5356
5357 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5358 return NULL;
5359
Thomas Wouters7e474022000-07-16 12:04:32 +00005360 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361 i = j = 0;
5362 e = self->str + self->length;
5363 for (p = self->str; p < e; p++)
5364 if (*p == '\t') {
5365 if (tabsize > 0)
5366 j += tabsize - (j % tabsize);
5367 }
5368 else {
5369 j++;
5370 if (*p == '\n' || *p == '\r') {
5371 i += j;
5372 j = 0;
5373 }
5374 }
5375
5376 /* Second pass: create output string and fill it */
5377 u = _PyUnicode_New(i + j);
5378 if (!u)
5379 return NULL;
5380
5381 j = 0;
5382 q = u->str;
5383
5384 for (p = self->str; p < e; p++)
5385 if (*p == '\t') {
5386 if (tabsize > 0) {
5387 i = tabsize - (j % tabsize);
5388 j += i;
5389 while (i--)
5390 *q++ = ' ';
5391 }
5392 }
5393 else {
5394 j++;
5395 *q++ = *p;
5396 if (*p == '\n' || *p == '\r')
5397 j = 0;
5398 }
5399
5400 return (PyObject*) u;
5401}
5402
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005403PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404"S.find(sub [,start [,end]]) -> int\n\
5405\n\
5406Return the lowest index in S where substring sub is found,\n\
5407such that sub is contained within s[start,end]. Optional\n\
5408arguments start and end are interpreted as in slice notation.\n\
5409\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005410Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005411
5412static PyObject *
5413unicode_find(PyUnicodeObject *self, PyObject *args)
5414{
5415 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005416 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005417 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418 PyObject *result;
5419
Guido van Rossumb8872e62000-05-09 14:14:27 +00005420 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5421 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422 return NULL;
5423 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5424 (PyObject *)substring);
5425 if (substring == NULL)
5426 return NULL;
5427
Martin v. Löwis18e16552006-02-15 17:27:45 +00005428 result = PyInt_FromSsize_t(findstring(self, substring, start, end, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429
5430 Py_DECREF(substring);
5431 return result;
5432}
5433
5434static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005435unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436{
5437 if (index < 0 || index >= self->length) {
5438 PyErr_SetString(PyExc_IndexError, "string index out of range");
5439 return NULL;
5440 }
5441
5442 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5443}
5444
5445static long
5446unicode_hash(PyUnicodeObject *self)
5447{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005448 /* Since Unicode objects compare equal to their ASCII string
5449 counterparts, they should use the individual character values
5450 as basis for their hash value. This is needed to assure that
5451 strings and Unicode objects behave in the same way as
5452 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453
Martin v. Löwis18e16552006-02-15 17:27:45 +00005454 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005455 register Py_UNICODE *p;
5456 register long x;
5457
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458 if (self->hash != -1)
5459 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005460 len = PyUnicode_GET_SIZE(self);
5461 p = PyUnicode_AS_UNICODE(self);
5462 x = *p << 7;
5463 while (--len >= 0)
5464 x = (1000003*x) ^ *p++;
5465 x ^= PyUnicode_GET_SIZE(self);
5466 if (x == -1)
5467 x = -2;
5468 self->hash = x;
5469 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005470}
5471
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005472PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473"S.index(sub [,start [,end]]) -> int\n\
5474\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005475Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476
5477static PyObject *
5478unicode_index(PyUnicodeObject *self, PyObject *args)
5479{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005480 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005482 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005483 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484
Guido van Rossumb8872e62000-05-09 14:14:27 +00005485 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5486 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005488
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5490 (PyObject *)substring);
5491 if (substring == NULL)
5492 return NULL;
5493
5494 result = findstring(self, substring, start, end, 1);
5495
5496 Py_DECREF(substring);
5497 if (result < 0) {
5498 PyErr_SetString(PyExc_ValueError, "substring not found");
5499 return NULL;
5500 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005501 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005502}
5503
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005504PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005505"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005507Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005508at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509
5510static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005511unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005512{
5513 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5514 register const Py_UNICODE *e;
5515 int cased;
5516
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517 /* Shortcut for single character strings */
5518 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005519 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005521 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005522 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005523 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005524
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525 e = p + PyUnicode_GET_SIZE(self);
5526 cased = 0;
5527 for (; p < e; p++) {
5528 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005529
Guido van Rossumd57fd912000-03-10 22:53:23 +00005530 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005531 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005532 else if (!cased && Py_UNICODE_ISLOWER(ch))
5533 cased = 1;
5534 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005535 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536}
5537
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005538PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005539"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005541Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005542at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005543
5544static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005545unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546{
5547 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5548 register const Py_UNICODE *e;
5549 int cased;
5550
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551 /* Shortcut for single character strings */
5552 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005553 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005555 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005556 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005557 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005558
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559 e = p + PyUnicode_GET_SIZE(self);
5560 cased = 0;
5561 for (; p < e; p++) {
5562 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005563
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005565 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566 else if (!cased && Py_UNICODE_ISUPPER(ch))
5567 cased = 1;
5568 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005569 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570}
5571
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005572PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005573"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005575Return True if S is a titlecased string and there is at least one\n\
5576character in S, i.e. upper- and titlecase characters may only\n\
5577follow uncased characters and lowercase characters only cased ones.\n\
5578Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579
5580static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005581unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582{
5583 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5584 register const Py_UNICODE *e;
5585 int cased, previous_is_cased;
5586
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587 /* Shortcut for single character strings */
5588 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005589 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5590 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005591
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005592 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005593 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005594 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005595
Guido van Rossumd57fd912000-03-10 22:53:23 +00005596 e = p + PyUnicode_GET_SIZE(self);
5597 cased = 0;
5598 previous_is_cased = 0;
5599 for (; p < e; p++) {
5600 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005601
Guido van Rossumd57fd912000-03-10 22:53:23 +00005602 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5603 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005604 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605 previous_is_cased = 1;
5606 cased = 1;
5607 }
5608 else if (Py_UNICODE_ISLOWER(ch)) {
5609 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005610 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 previous_is_cased = 1;
5612 cased = 1;
5613 }
5614 else
5615 previous_is_cased = 0;
5616 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005617 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618}
5619
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005620PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005621"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005622\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005623Return True if all characters in S are whitespace\n\
5624and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625
5626static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005627unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628{
5629 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5630 register const Py_UNICODE *e;
5631
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632 /* Shortcut for single character strings */
5633 if (PyUnicode_GET_SIZE(self) == 1 &&
5634 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005635 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005637 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005638 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005639 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005640
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641 e = p + PyUnicode_GET_SIZE(self);
5642 for (; p < e; p++) {
5643 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005644 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005646 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647}
5648
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005649PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005650"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005651\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005652Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005653and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005654
5655static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005656unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005657{
5658 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5659 register const Py_UNICODE *e;
5660
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005661 /* Shortcut for single character strings */
5662 if (PyUnicode_GET_SIZE(self) == 1 &&
5663 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005664 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005665
5666 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005667 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005668 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005669
5670 e = p + PyUnicode_GET_SIZE(self);
5671 for (; p < e; p++) {
5672 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005673 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005674 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005675 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005676}
5677
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005678PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005679"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005680\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005681Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005682and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005683
5684static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005685unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005686{
5687 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5688 register const Py_UNICODE *e;
5689
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005690 /* Shortcut for single character strings */
5691 if (PyUnicode_GET_SIZE(self) == 1 &&
5692 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005693 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005694
5695 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005696 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005697 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005698
5699 e = p + PyUnicode_GET_SIZE(self);
5700 for (; p < e; p++) {
5701 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005702 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005703 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005704 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005705}
5706
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005707PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005708"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005710Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005711False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712
5713static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005714unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715{
5716 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5717 register const Py_UNICODE *e;
5718
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719 /* Shortcut for single character strings */
5720 if (PyUnicode_GET_SIZE(self) == 1 &&
5721 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005722 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005724 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005725 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005726 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005727
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728 e = p + PyUnicode_GET_SIZE(self);
5729 for (; p < e; p++) {
5730 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005731 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005733 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734}
5735
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005736PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005737"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005739Return True if all characters in S are digits\n\
5740and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741
5742static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005743unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744{
5745 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5746 register const Py_UNICODE *e;
5747
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748 /* Shortcut for single character strings */
5749 if (PyUnicode_GET_SIZE(self) == 1 &&
5750 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005751 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005753 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005754 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005755 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005756
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757 e = p + PyUnicode_GET_SIZE(self);
5758 for (; p < e; p++) {
5759 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005760 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005762 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763}
5764
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005765PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005766"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005768Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005769False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770
5771static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005772unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773{
5774 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5775 register const Py_UNICODE *e;
5776
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777 /* Shortcut for single character strings */
5778 if (PyUnicode_GET_SIZE(self) == 1 &&
5779 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005780 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005781
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005782 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005783 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005784 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005785
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786 e = p + PyUnicode_GET_SIZE(self);
5787 for (; p < e; p++) {
5788 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005789 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005791 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792}
5793
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005794PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795"S.join(sequence) -> unicode\n\
5796\n\
5797Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005798sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799
5800static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005801unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005803 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005804}
5805
Martin v. Löwis18e16552006-02-15 17:27:45 +00005806static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00005807unicode_length(PyUnicodeObject *self)
5808{
5809 return self->length;
5810}
5811
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005812PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005813"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005814\n\
5815Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005816done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817
5818static PyObject *
5819unicode_ljust(PyUnicodeObject *self, PyObject *args)
5820{
Martin v. Löwis412fb672006-04-13 06:34:32 +00005821 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005822 Py_UNICODE fillchar = ' ';
5823
Martin v. Löwis412fb672006-04-13 06:34:32 +00005824 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825 return NULL;
5826
Tim Peters7a29bd52001-09-12 03:03:31 +00005827 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828 Py_INCREF(self);
5829 return (PyObject*) self;
5830 }
5831
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005832 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005833}
5834
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005835PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836"S.lower() -> unicode\n\
5837\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005838Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839
5840static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005841unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843 return fixup(self, fixlower);
5844}
5845
/* Selectors telling the strip helpers which end(s) of the string to trim */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string minus its 3-character
   "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
5854
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005855/* externally visible for str.strip(unicode) */
5856PyObject *
5857_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5858{
5859 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005860 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005861 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005862 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
5863 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005864
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005865 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
5866
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005867 i = 0;
5868 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005869 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
5870 i++;
5871 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005872 }
5873
5874 j = len;
5875 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005876 do {
5877 j--;
5878 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
5879 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005880 }
5881
5882 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005883 Py_INCREF(self);
5884 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005885 }
5886 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005887 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005888}
5889
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890
5891static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005892do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005894 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005895 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005896
5897 i = 0;
5898 if (striptype != RIGHTSTRIP) {
5899 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5900 i++;
5901 }
5902 }
5903
5904 j = len;
5905 if (striptype != LEFTSTRIP) {
5906 do {
5907 j--;
5908 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5909 j++;
5910 }
5911
5912 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5913 Py_INCREF(self);
5914 return (PyObject*)self;
5915 }
5916 else
5917 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918}
5919
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005920
5921static PyObject *
5922do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5923{
5924 PyObject *sep = NULL;
5925
5926 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5927 return NULL;
5928
5929 if (sep != NULL && sep != Py_None) {
5930 if (PyUnicode_Check(sep))
5931 return _PyUnicode_XStrip(self, striptype, sep);
5932 else if (PyString_Check(sep)) {
5933 PyObject *res;
5934 sep = PyUnicode_FromObject(sep);
5935 if (sep==NULL)
5936 return NULL;
5937 res = _PyUnicode_XStrip(self, striptype, sep);
5938 Py_DECREF(sep);
5939 return res;
5940 }
5941 else {
5942 PyErr_Format(PyExc_TypeError,
5943 "%s arg must be None, unicode or str",
5944 STRIPNAME(striptype));
5945 return NULL;
5946 }
5947 }
5948
5949 return do_strip(self, striptype);
5950}
5951
5952
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005953PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005954"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005955\n\
5956Return a copy of the string S with leading and trailing\n\
5957whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005958If chars is given and not None, remove characters in chars instead.\n\
5959If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005960
5961static PyObject *
5962unicode_strip(PyUnicodeObject *self, PyObject *args)
5963{
5964 if (PyTuple_GET_SIZE(args) == 0)
5965 return do_strip(self, BOTHSTRIP); /* Common case */
5966 else
5967 return do_argstrip(self, BOTHSTRIP, args);
5968}
5969
5970
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005971PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005972"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005973\n\
5974Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005975If chars is given and not None, remove characters in chars instead.\n\
5976If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005977
5978static PyObject *
5979unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5980{
5981 if (PyTuple_GET_SIZE(args) == 0)
5982 return do_strip(self, LEFTSTRIP); /* Common case */
5983 else
5984 return do_argstrip(self, LEFTSTRIP, args);
5985}
5986
5987
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005988PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005989"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005990\n\
5991Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005992If chars is given and not None, remove characters in chars instead.\n\
5993If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005994
5995static PyObject *
5996unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5997{
5998 if (PyTuple_GET_SIZE(args) == 0)
5999 return do_strip(self, RIGHTSTRIP); /* Common case */
6000 else
6001 return do_argstrip(self, RIGHTSTRIP, args);
6002}
6003
6004
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006006unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007{
6008 PyUnicodeObject *u;
6009 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006010 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006011 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012
6013 if (len < 0)
6014 len = 0;
6015
Tim Peters7a29bd52001-09-12 03:03:31 +00006016 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017 /* no repeat, return original string */
6018 Py_INCREF(str);
6019 return (PyObject*) str;
6020 }
Tim Peters8f422462000-09-09 06:13:41 +00006021
6022 /* ensure # of chars needed doesn't overflow int and # of bytes
6023 * needed doesn't overflow size_t
6024 */
6025 nchars = len * str->length;
6026 if (len && nchars / len != str->length) {
6027 PyErr_SetString(PyExc_OverflowError,
6028 "repeated string is too long");
6029 return NULL;
6030 }
6031 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6032 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6033 PyErr_SetString(PyExc_OverflowError,
6034 "repeated string is too long");
6035 return NULL;
6036 }
6037 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038 if (!u)
6039 return NULL;
6040
6041 p = u->str;
6042
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006043 if (str->length == 1 && len > 0) {
6044 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006045 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00006046 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006047 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006048 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006049 done = str->length;
6050 }
6051 while (done < nchars) {
6052 int n = (done <= nchars-done) ? done : nchars-done;
6053 Py_UNICODE_COPY(p+done, p, n);
6054 done += n;
6055 }
6056 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057
6058 return (PyObject*) u;
6059}
6060
6061PyObject *PyUnicode_Replace(PyObject *obj,
6062 PyObject *subobj,
6063 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006064 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006065{
6066 PyObject *self;
6067 PyObject *str1;
6068 PyObject *str2;
6069 PyObject *result;
6070
6071 self = PyUnicode_FromObject(obj);
6072 if (self == NULL)
6073 return NULL;
6074 str1 = PyUnicode_FromObject(subobj);
6075 if (str1 == NULL) {
6076 Py_DECREF(self);
6077 return NULL;
6078 }
6079 str2 = PyUnicode_FromObject(replobj);
6080 if (str2 == NULL) {
6081 Py_DECREF(self);
6082 Py_DECREF(str1);
6083 return NULL;
6084 }
Tim Petersced69f82003-09-16 20:30:58 +00006085 result = replace((PyUnicodeObject *)self,
6086 (PyUnicodeObject *)str1,
6087 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088 maxcount);
6089 Py_DECREF(self);
6090 Py_DECREF(str1);
6091 Py_DECREF(str2);
6092 return result;
6093}
6094
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006095PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096"S.replace (old, new[, maxsplit]) -> unicode\n\
6097\n\
6098Return a copy of S with all occurrences of substring\n\
6099old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006100given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101
6102static PyObject*
6103unicode_replace(PyUnicodeObject *self, PyObject *args)
6104{
6105 PyUnicodeObject *str1;
6106 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006107 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108 PyObject *result;
6109
Martin v. Löwis18e16552006-02-15 17:27:45 +00006110 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111 return NULL;
6112 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6113 if (str1 == NULL)
6114 return NULL;
6115 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006116 if (str2 == NULL) {
6117 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006119 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120
6121 result = replace(self, str1, str2, maxcount);
6122
6123 Py_DECREF(str1);
6124 Py_DECREF(str2);
6125 return result;
6126}
6127
6128static
6129PyObject *unicode_repr(PyObject *unicode)
6130{
6131 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6132 PyUnicode_GET_SIZE(unicode),
6133 1);
6134}
6135
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006136PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137"S.rfind(sub [,start [,end]]) -> int\n\
6138\n\
6139Return the highest index in S where substring sub is found,\n\
6140such that sub is contained within s[start,end]. Optional\n\
6141arguments start and end are interpreted as in slice notation.\n\
6142\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006143Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144
6145static PyObject *
6146unicode_rfind(PyUnicodeObject *self, PyObject *args)
6147{
6148 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006149 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006150 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151 PyObject *result;
6152
Guido van Rossumb8872e62000-05-09 14:14:27 +00006153 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6154 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155 return NULL;
6156 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6157 (PyObject *)substring);
6158 if (substring == NULL)
6159 return NULL;
6160
Martin v. Löwis18e16552006-02-15 17:27:45 +00006161 result = PyInt_FromSsize_t(findstring(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162
6163 Py_DECREF(substring);
6164 return result;
6165}
6166
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006167PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168"S.rindex(sub [,start [,end]]) -> int\n\
6169\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006170Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171
6172static PyObject *
6173unicode_rindex(PyUnicodeObject *self, PyObject *args)
6174{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006175 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006177 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006178 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179
Guido van Rossumb8872e62000-05-09 14:14:27 +00006180 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6181 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182 return NULL;
6183 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6184 (PyObject *)substring);
6185 if (substring == NULL)
6186 return NULL;
6187
6188 result = findstring(self, substring, start, end, -1);
6189
6190 Py_DECREF(substring);
6191 if (result < 0) {
6192 PyErr_SetString(PyExc_ValueError, "substring not found");
6193 return NULL;
6194 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006195 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196}
6197
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006198PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006199"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200\n\
6201Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006202done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203
6204static PyObject *
6205unicode_rjust(PyUnicodeObject *self, PyObject *args)
6206{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006207 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006208 Py_UNICODE fillchar = ' ';
6209
Martin v. Löwis412fb672006-04-13 06:34:32 +00006210 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211 return NULL;
6212
Tim Peters7a29bd52001-09-12 03:03:31 +00006213 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214 Py_INCREF(self);
6215 return (PyObject*) self;
6216 }
6217
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006218 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219}
6220
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006222unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223{
6224 /* standard clamping */
6225 if (start < 0)
6226 start = 0;
6227 if (end < 0)
6228 end = 0;
6229 if (end > self->length)
6230 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006231 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 /* full slice, return original string */
6233 Py_INCREF(self);
6234 return (PyObject*) self;
6235 }
6236 if (start > end)
6237 start = end;
6238 /* copy slice */
6239 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6240 end - start);
6241}
6242
6243PyObject *PyUnicode_Split(PyObject *s,
6244 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006245 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246{
6247 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006248
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249 s = PyUnicode_FromObject(s);
6250 if (s == NULL)
6251 return NULL;
6252 if (sep != NULL) {
6253 sep = PyUnicode_FromObject(sep);
6254 if (sep == NULL) {
6255 Py_DECREF(s);
6256 return NULL;
6257 }
6258 }
6259
6260 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6261
6262 Py_DECREF(s);
6263 Py_XDECREF(sep);
6264 return result;
6265}
6266
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006267PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268"S.split([sep [,maxsplit]]) -> list of strings\n\
6269\n\
6270Return a list of the words in S, using sep as the\n\
6271delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006272splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006273any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274
6275static PyObject*
6276unicode_split(PyUnicodeObject *self, PyObject *args)
6277{
6278 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006279 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280
Martin v. Löwis18e16552006-02-15 17:27:45 +00006281 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282 return NULL;
6283
6284 if (substring == Py_None)
6285 return split(self, NULL, maxcount);
6286 else if (PyUnicode_Check(substring))
6287 return split(self, (PyUnicodeObject *)substring, maxcount);
6288 else
6289 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6290}
6291
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006292PyObject *
6293PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6294{
6295 PyObject* str_obj;
6296 PyObject* sep_obj;
6297 Py_UNICODE *str, *sep;
6298 Py_ssize_t len, sep_len, pos;
6299 PyObject* out;
6300
6301 str_obj = PyUnicode_FromObject(str_in);
6302 if (!str_obj)
6303 return NULL;
6304 sep_obj = PyUnicode_FromObject(sep_in);
6305 if (!sep_obj)
6306 goto error;
6307
6308 str = PyUnicode_AS_UNICODE(str_obj);
6309 len = PyUnicode_GET_SIZE(str_obj);
6310
6311 sep = PyUnicode_AS_UNICODE(sep_obj);
6312 sep_len = PyUnicode_GET_SIZE(sep_obj);
6313
6314 if (sep_len == 0) {
6315 PyErr_SetString(PyExc_ValueError, "empty separator");
6316 goto error;
6317 }
6318
6319 out = PyTuple_New(3);
6320 if (!out)
6321 goto error;
6322
6323 pos = fastsearch(str, len, sep, sep_len, FAST_SEARCH);
6324 if (pos < 0) {
6325 Py_INCREF(str_obj);
6326 PyTuple_SET_ITEM(out, 0, (PyObject*) str_obj);
6327 Py_INCREF(unicode_empty);
6328 PyTuple_SET_ITEM(out, 1, (PyObject*) unicode_empty);
6329 Py_INCREF(unicode_empty);
6330 PyTuple_SET_ITEM(out, 2, (PyObject*) unicode_empty);
6331 } else {
6332 PyObject* obj;
6333 PyTuple_SET_ITEM(out, 0, PyUnicode_FromUnicode(str, pos));
6334 Py_INCREF(sep_obj);
6335 PyTuple_SET_ITEM(out, 1, sep_obj);
6336 obj = PyUnicode_FromUnicode(str + sep_len + pos, len - sep_len - pos);
6337 PyTuple_SET_ITEM(out, 2, obj);
6338 if (PyErr_Occurred()) {
6339 Py_DECREF(out);
6340 goto error;
6341 }
6342 }
6343
6344 return out;
6345
6346error:
6347 Py_XDECREF(sep_obj);
6348 Py_DECREF(str_obj);
6349 return NULL;
6350}
6351
6352PyDoc_STRVAR(partition__doc__,
6353"S.partition(sep) -> (head, sep, tail)\n\
6354\n\
6355Searches for the separator sep in S, and returns the part before it,\n\
6356the separator itself, and the part after it. If the separator is not\n\
6357found, returns S and two empty strings.");
6358
6359static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00006360unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006361{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006362 return PyUnicode_Partition((PyObject *)self, separator);
6363}
6364
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006365PyObject *PyUnicode_RSplit(PyObject *s,
6366 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006367 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006368{
6369 PyObject *result;
6370
6371 s = PyUnicode_FromObject(s);
6372 if (s == NULL)
6373 return NULL;
6374 if (sep != NULL) {
6375 sep = PyUnicode_FromObject(sep);
6376 if (sep == NULL) {
6377 Py_DECREF(s);
6378 return NULL;
6379 }
6380 }
6381
6382 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6383
6384 Py_DECREF(s);
6385 Py_XDECREF(sep);
6386 return result;
6387}
6388
6389PyDoc_STRVAR(rsplit__doc__,
6390"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6391\n\
6392Return a list of the words in S, using sep as the\n\
6393delimiter string, starting at the end of the string and\n\
6394working to the front. If maxsplit is given, at most maxsplit\n\
6395splits are done. If sep is not specified, any whitespace string\n\
6396is a separator.");
6397
6398static PyObject*
6399unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6400{
6401 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006402 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006403
Martin v. Löwis18e16552006-02-15 17:27:45 +00006404 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006405 return NULL;
6406
6407 if (substring == Py_None)
6408 return rsplit(self, NULL, maxcount);
6409 else if (PyUnicode_Check(substring))
6410 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6411 else
6412 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6413}
6414
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006415PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006416"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417\n\
6418Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006419Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006420is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421
6422static PyObject*
6423unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6424{
Guido van Rossum86662912000-04-11 15:38:46 +00006425 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426
Guido van Rossum86662912000-04-11 15:38:46 +00006427 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428 return NULL;
6429
Guido van Rossum86662912000-04-11 15:38:46 +00006430 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431}
6432
6433static
6434PyObject *unicode_str(PyUnicodeObject *self)
6435{
Fred Drakee4315f52000-05-09 19:53:39 +00006436 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437}
6438
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006439PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440"S.swapcase() -> unicode\n\
6441\n\
6442Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006443and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444
6445static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006446unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448 return fixup(self, fixswapcase);
6449}
6450
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006451PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452"S.translate(table) -> unicode\n\
6453\n\
6454Return a copy of the string S, where all characters have been mapped\n\
6455through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006456Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6457Unmapped characters are left untouched. Characters mapped to None\n\
6458are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459
6460static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006461unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462{
Tim Petersced69f82003-09-16 20:30:58 +00006463 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006465 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466 "ignore");
6467}
6468
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006469PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470"S.upper() -> unicode\n\
6471\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006472Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473
6474static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006475unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477 return fixup(self, fixupper);
6478}
6479
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006480PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481"S.zfill(width) -> unicode\n\
6482\n\
6483Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006484of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485
6486static PyObject *
6487unicode_zfill(PyUnicodeObject *self, PyObject *args)
6488{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006489 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490 PyUnicodeObject *u;
6491
Martin v. Löwis18e16552006-02-15 17:27:45 +00006492 Py_ssize_t width;
6493 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494 return NULL;
6495
6496 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006497 if (PyUnicode_CheckExact(self)) {
6498 Py_INCREF(self);
6499 return (PyObject*) self;
6500 }
6501 else
6502 return PyUnicode_FromUnicode(
6503 PyUnicode_AS_UNICODE(self),
6504 PyUnicode_GET_SIZE(self)
6505 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506 }
6507
6508 fill = width - self->length;
6509
6510 u = pad(self, fill, 0, '0');
6511
Walter Dörwald068325e2002-04-15 13:36:47 +00006512 if (u == NULL)
6513 return NULL;
6514
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515 if (u->str[fill] == '+' || u->str[fill] == '-') {
6516 /* move sign to beginning of string */
6517 u->str[0] = u->str[fill];
6518 u->str[fill] = '0';
6519 }
6520
6521 return (PyObject*) u;
6522}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523
#if 0
/* Compiled out.  Reports unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
6531
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006532PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006533"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006535Return True if S starts with the specified prefix, False otherwise.\n\
6536With optional start, test S beginning at that position.\n\
6537With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538
6539static PyObject *
6540unicode_startswith(PyUnicodeObject *self,
6541 PyObject *args)
6542{
6543 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006544 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006545 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546 PyObject *result;
6547
Guido van Rossumb8872e62000-05-09 14:14:27 +00006548 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6549 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550 return NULL;
6551 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6552 (PyObject *)substring);
6553 if (substring == NULL)
6554 return NULL;
6555
Guido van Rossum77f6a652002-04-03 22:41:51 +00006556 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557
6558 Py_DECREF(substring);
6559 return result;
6560}
6561
6562
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006563PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006564"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006566Return True if S ends with the specified suffix, False otherwise.\n\
6567With optional start, test S beginning at that position.\n\
6568With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569
6570static PyObject *
6571unicode_endswith(PyUnicodeObject *self,
6572 PyObject *args)
6573{
6574 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006575 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006576 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577 PyObject *result;
6578
Guido van Rossumb8872e62000-05-09 14:14:27 +00006579 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6580 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581 return NULL;
6582 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6583 (PyObject *)substring);
6584 if (substring == NULL)
6585 return NULL;
6586
Guido van Rossum77f6a652002-04-03 22:41:51 +00006587 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588
6589 Py_DECREF(substring);
6590 return result;
6591}
6592
6593
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006594
6595static PyObject *
6596unicode_getnewargs(PyUnicodeObject *v)
6597{
6598 return Py_BuildValue("(u#)", v->str, v->length);
6599}
6600
6601
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602static PyMethodDef unicode_methods[] = {
6603
6604 /* Order is according to common usage: often used methods should
6605 appear first, since lookup is done sequentially. */
6606
Georg Brandlecdc0a92006-03-30 12:19:07 +00006607 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006608 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6609 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006610 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006611 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6612 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6613 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6614 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6615 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6616 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6617 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00006618 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006619 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6620 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6621 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006622 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006623 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006624/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6625 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6626 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6627 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006628 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006629 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006630 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006631 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6632 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6633 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6634 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6635 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6636 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6637 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6638 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6639 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6640 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6641 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6642 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6643 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6644 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006645 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006646#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006647 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648#endif
6649
6650#if 0
6651 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006652 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653#endif
6654
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006655 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656 {NULL, NULL}
6657};
6658
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006659static PyObject *
6660unicode_mod(PyObject *v, PyObject *w)
6661{
6662 if (!PyUnicode_Check(v)) {
6663 Py_INCREF(Py_NotImplemented);
6664 return Py_NotImplemented;
6665 }
6666 return PyUnicode_Format(v, w);
6667}
6668
6669static PyNumberMethods unicode_as_number = {
6670 0, /*nb_add*/
6671 0, /*nb_subtract*/
6672 0, /*nb_multiply*/
6673 0, /*nb_divide*/
6674 unicode_mod, /*nb_remainder*/
6675};
6676
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006678 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00006679 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006680 (ssizeargfunc) unicode_repeat, /* sq_repeat */
6681 (ssizeargfunc) unicode_getitem, /* sq_item */
6682 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683 0, /* sq_ass_item */
6684 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00006685 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686};
6687
Guido van Rossum38fff8c2006-03-07 18:50:55 +00006688#define HASINDEX(o) PyType_HasFeature((o)->ob_type, Py_TPFLAGS_HAVE_INDEX)
6689
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006690static PyObject*
6691unicode_subscript(PyUnicodeObject* self, PyObject* item)
6692{
Guido van Rossum38fff8c2006-03-07 18:50:55 +00006693 PyNumberMethods *nb = item->ob_type->tp_as_number;
6694 if (nb != NULL && HASINDEX(item) && nb->nb_index != NULL) {
6695 Py_ssize_t i = nb->nb_index(item);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006696 if (i == -1 && PyErr_Occurred())
6697 return NULL;
6698 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006699 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006700 return unicode_getitem(self, i);
6701 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006702 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006703 Py_UNICODE* source_buf;
6704 Py_UNICODE* result_buf;
6705 PyObject* result;
6706
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006707 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006708 &start, &stop, &step, &slicelength) < 0) {
6709 return NULL;
6710 }
6711
6712 if (slicelength <= 0) {
6713 return PyUnicode_FromUnicode(NULL, 0);
6714 } else {
6715 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Anthony Baxtera6286212006-04-11 07:42:36 +00006716 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
6717 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006718
6719 if (result_buf == NULL)
6720 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006721
6722 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6723 result_buf[i] = source_buf[cur];
6724 }
Tim Petersced69f82003-09-16 20:30:58 +00006725
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006726 result = PyUnicode_FromUnicode(result_buf, slicelength);
6727 PyMem_FREE(result_buf);
6728 return result;
6729 }
6730 } else {
6731 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6732 return NULL;
6733 }
6734}
6735
6736static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006737 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006738 (binaryfunc)unicode_subscript, /* mp_subscript */
6739 (objobjargproc)0, /* mp_ass_subscript */
6740};
6741
Martin v. Löwis18e16552006-02-15 17:27:45 +00006742static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006744 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745 const void **ptr)
6746{
6747 if (index != 0) {
6748 PyErr_SetString(PyExc_SystemError,
6749 "accessing non-existent unicode segment");
6750 return -1;
6751 }
6752 *ptr = (void *) self->str;
6753 return PyUnicode_GET_DATA_SIZE(self);
6754}
6755
Martin v. Löwis18e16552006-02-15 17:27:45 +00006756static Py_ssize_t
6757unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758 const void **ptr)
6759{
6760 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006761 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762 return -1;
6763}
6764
6765static int
6766unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006767 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768{
6769 if (lenp)
6770 *lenp = PyUnicode_GET_DATA_SIZE(self);
6771 return 1;
6772}
6773
Martin v. Löwiseb079f12006-02-16 14:32:27 +00006774static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006776 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777 const void **ptr)
6778{
6779 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006780
Guido van Rossumd57fd912000-03-10 22:53:23 +00006781 if (index != 0) {
6782 PyErr_SetString(PyExc_SystemError,
6783 "accessing non-existent unicode segment");
6784 return -1;
6785 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006786 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787 if (str == NULL)
6788 return -1;
6789 *ptr = (void *) PyString_AS_STRING(str);
6790 return PyString_GET_SIZE(str);
6791}
6792
6793/* Helpers for PyUnicode_Format() */
6794
6795static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006796getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006797{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006798 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799 if (argidx < arglen) {
6800 (*p_argidx)++;
6801 if (arglen < 0)
6802 return args;
6803 else
6804 return PyTuple_GetItem(args, argidx);
6805 }
6806 PyErr_SetString(PyExc_TypeError,
6807 "not enough arguments for format string");
6808 return NULL;
6809}
6810
/* Formatting flags, one bit each so they can be OR-ed together.
   NOTE(review): presumably set from the conversion flag characters
   parsed by PyUnicode_Format() -- confirm at the use sites. */
#define F_LJUST (1<<0)
#define F_SIGN  (1<<1)
#define F_BLANK (1<<2)
#define F_ALT   (1<<3)
#define F_ZERO  (1<<4)
6816
Martin v. Löwis18e16552006-02-15 17:27:45 +00006817static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00006818strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006820 register Py_ssize_t i;
6821 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822 for (i = len - 1; i >= 0; i--)
6823 buffer[i] = (Py_UNICODE) charbuffer[i];
6824
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825 return len;
6826}
6827
Neal Norwitzfc76d632006-01-10 06:03:13 +00006828static int
6829doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
6830{
Tim Peters15231542006-02-16 01:08:01 +00006831 Py_ssize_t result;
6832
Neal Norwitzfc76d632006-01-10 06:03:13 +00006833 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006834 result = strtounicode(buffer, (char *)buffer);
6835 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006836}
6837
6838static int
6839longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
6840{
Tim Peters15231542006-02-16 01:08:01 +00006841 Py_ssize_t result;
6842
Neal Norwitzfc76d632006-01-10 06:03:13 +00006843 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006844 result = strtounicode(buffer, (char *)buffer);
6845 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006846}
6847
Guido van Rossum078151d2002-08-11 04:24:12 +00006848/* XXX To save some code duplication, formatfloat/long/int could have been
6849 shared with stringobject.c, converting from 8-bit to Unicode after the
6850 formatting is done. */
6851
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852static int
6853formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006854 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855 int flags,
6856 int prec,
6857 int type,
6858 PyObject *v)
6859{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006860 /* fmt = '%#.' + `prec` + `type`
6861 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862 char fmt[20];
6863 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006864
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865 x = PyFloat_AsDouble(v);
6866 if (x == -1.0 && PyErr_Occurred())
6867 return -1;
6868 if (prec < 0)
6869 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6871 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006872 /* Worst case length calc to ensure no buffer overrun:
6873
6874 'g' formats:
6875 fmt = %#.<prec>g
6876 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6877 for any double rep.)
6878 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6879
6880 'f' formats:
6881 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6882 len = 1 + 50 + 1 + prec = 52 + prec
6883
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006884 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006885 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006886
6887 */
6888 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6889 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006890 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006891 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006892 return -1;
6893 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006894 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6895 (flags&F_ALT) ? "#" : "",
6896 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006897 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898}
6899
Tim Peters38fd5b62000-09-21 05:43:11 +00006900static PyObject*
6901formatlong(PyObject *val, int flags, int prec, int type)
6902{
6903 char *buf;
6904 int i, len;
6905 PyObject *str; /* temporary string object. */
6906 PyUnicodeObject *result;
6907
6908 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6909 if (!str)
6910 return NULL;
6911 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006912 if (!result) {
6913 Py_DECREF(str);
6914 return NULL;
6915 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006916 for (i = 0; i < len; i++)
6917 result->str[i] = buf[i];
6918 result->str[len] = 0;
6919 Py_DECREF(str);
6920 return (PyObject*)result;
6921}
6922
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923static int
6924formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006925 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926 int flags,
6927 int prec,
6928 int type,
6929 PyObject *v)
6930{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006931 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006932 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6933 * + 1 + 1
6934 * = 24
6935 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006936 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006937 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938 long x;
6939
6940 x = PyInt_AsLong(v);
6941 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006942 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006943 if (x < 0 && type == 'u') {
6944 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006945 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006946 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6947 sign = "-";
6948 else
6949 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006951 prec = 1;
6952
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006953 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6954 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006955 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006956 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006957 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006958 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006959 return -1;
6960 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006961
6962 if ((flags & F_ALT) &&
6963 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006964 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006965 * of issues that cause pain:
6966 * - when 0 is being converted, the C standard leaves off
6967 * the '0x' or '0X', which is inconsistent with other
6968 * %#x/%#X conversions and inconsistent with Python's
6969 * hex() function
6970 * - there are platforms that violate the standard and
6971 * convert 0 with the '0x' or '0X'
6972 * (Metrowerks, Compaq Tru64)
6973 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006974 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006975 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006976 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006977 * We can achieve the desired consistency by inserting our
6978 * own '0x' or '0X' prefix, and substituting %x/%X in place
6979 * of %#x/%#X.
6980 *
6981 * Note that this is the same approach as used in
6982 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006983 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006984 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6985 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006986 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006987 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006988 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6989 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006990 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006991 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006992 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00006993 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006994 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00006995 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996}
6997
6998static int
6999formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007000 size_t buflen,
7001 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007003 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007004 if (PyUnicode_Check(v)) {
7005 if (PyUnicode_GET_SIZE(v) != 1)
7006 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007008 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007010 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007011 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007012 goto onError;
7013 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7014 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007015
7016 else {
7017 /* Integer input truncated to a character */
7018 long x;
7019 x = PyInt_AsLong(v);
7020 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007021 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007022#ifdef Py_UNICODE_WIDE
7023 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007024 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007025 "%c arg not in range(0x110000) "
7026 "(wide Python build)");
7027 return -1;
7028 }
7029#else
7030 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007031 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007032 "%c arg not in range(0x10000) "
7033 "(narrow Python build)");
7034 return -1;
7035 }
7036#endif
7037 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038 }
7039 buf[1] = '\0';
7040 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007041
7042 onError:
7043 PyErr_SetString(PyExc_TypeError,
7044 "%c requires int or char");
7045 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046}
7047
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesised so that the macro behaves as a
   single operand wherever it is used.
*/
#define FORMATBUFLEN ((size_t)120)
7057
Guido van Rossumd57fd912000-03-10 22:53:23 +00007058PyObject *PyUnicode_Format(PyObject *format,
7059 PyObject *args)
7060{
7061 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007062 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063 int args_owned = 0;
7064 PyUnicodeObject *result = NULL;
7065 PyObject *dict = NULL;
7066 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007067
Guido van Rossumd57fd912000-03-10 22:53:23 +00007068 if (format == NULL || args == NULL) {
7069 PyErr_BadInternalCall();
7070 return NULL;
7071 }
7072 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007073 if (uformat == NULL)
7074 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007075 fmt = PyUnicode_AS_UNICODE(uformat);
7076 fmtcnt = PyUnicode_GET_SIZE(uformat);
7077
7078 reslen = rescnt = fmtcnt + 100;
7079 result = _PyUnicode_New(reslen);
7080 if (result == NULL)
7081 goto onError;
7082 res = PyUnicode_AS_UNICODE(result);
7083
7084 if (PyTuple_Check(args)) {
7085 arglen = PyTuple_Size(args);
7086 argidx = 0;
7087 }
7088 else {
7089 arglen = -1;
7090 argidx = -2;
7091 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007092 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7093 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007094 dict = args;
7095
7096 while (--fmtcnt >= 0) {
7097 if (*fmt != '%') {
7098 if (--rescnt < 0) {
7099 rescnt = fmtcnt + 100;
7100 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007101 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007102 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007103 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7104 --rescnt;
7105 }
7106 *res++ = *fmt++;
7107 }
7108 else {
7109 /* Got a format specifier */
7110 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007111 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007113 Py_UNICODE c = '\0';
7114 Py_UNICODE fill;
7115 PyObject *v = NULL;
7116 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007117 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007118 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007119 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007120 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007121
7122 fmt++;
7123 if (*fmt == '(') {
7124 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007125 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007126 PyObject *key;
7127 int pcount = 1;
7128
7129 if (dict == NULL) {
7130 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007131 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132 goto onError;
7133 }
7134 ++fmt;
7135 --fmtcnt;
7136 keystart = fmt;
7137 /* Skip over balanced parentheses */
7138 while (pcount > 0 && --fmtcnt >= 0) {
7139 if (*fmt == ')')
7140 --pcount;
7141 else if (*fmt == '(')
7142 ++pcount;
7143 fmt++;
7144 }
7145 keylen = fmt - keystart - 1;
7146 if (fmtcnt < 0 || pcount > 0) {
7147 PyErr_SetString(PyExc_ValueError,
7148 "incomplete format key");
7149 goto onError;
7150 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007151#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007152 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007153 then looked up since Python uses strings to hold
7154 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007155 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156 key = PyUnicode_EncodeUTF8(keystart,
7157 keylen,
7158 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007159#else
7160 key = PyUnicode_FromUnicode(keystart, keylen);
7161#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162 if (key == NULL)
7163 goto onError;
7164 if (args_owned) {
7165 Py_DECREF(args);
7166 args_owned = 0;
7167 }
7168 args = PyObject_GetItem(dict, key);
7169 Py_DECREF(key);
7170 if (args == NULL) {
7171 goto onError;
7172 }
7173 args_owned = 1;
7174 arglen = -1;
7175 argidx = -2;
7176 }
7177 while (--fmtcnt >= 0) {
7178 switch (c = *fmt++) {
7179 case '-': flags |= F_LJUST; continue;
7180 case '+': flags |= F_SIGN; continue;
7181 case ' ': flags |= F_BLANK; continue;
7182 case '#': flags |= F_ALT; continue;
7183 case '0': flags |= F_ZERO; continue;
7184 }
7185 break;
7186 }
7187 if (c == '*') {
7188 v = getnextarg(args, arglen, &argidx);
7189 if (v == NULL)
7190 goto onError;
7191 if (!PyInt_Check(v)) {
7192 PyErr_SetString(PyExc_TypeError,
7193 "* wants int");
7194 goto onError;
7195 }
7196 width = PyInt_AsLong(v);
7197 if (width < 0) {
7198 flags |= F_LJUST;
7199 width = -width;
7200 }
7201 if (--fmtcnt >= 0)
7202 c = *fmt++;
7203 }
7204 else if (c >= '0' && c <= '9') {
7205 width = c - '0';
7206 while (--fmtcnt >= 0) {
7207 c = *fmt++;
7208 if (c < '0' || c > '9')
7209 break;
7210 if ((width*10) / 10 != width) {
7211 PyErr_SetString(PyExc_ValueError,
7212 "width too big");
7213 goto onError;
7214 }
7215 width = width*10 + (c - '0');
7216 }
7217 }
7218 if (c == '.') {
7219 prec = 0;
7220 if (--fmtcnt >= 0)
7221 c = *fmt++;
7222 if (c == '*') {
7223 v = getnextarg(args, arglen, &argidx);
7224 if (v == NULL)
7225 goto onError;
7226 if (!PyInt_Check(v)) {
7227 PyErr_SetString(PyExc_TypeError,
7228 "* wants int");
7229 goto onError;
7230 }
7231 prec = PyInt_AsLong(v);
7232 if (prec < 0)
7233 prec = 0;
7234 if (--fmtcnt >= 0)
7235 c = *fmt++;
7236 }
7237 else if (c >= '0' && c <= '9') {
7238 prec = c - '0';
7239 while (--fmtcnt >= 0) {
7240 c = Py_CHARMASK(*fmt++);
7241 if (c < '0' || c > '9')
7242 break;
7243 if ((prec*10) / 10 != prec) {
7244 PyErr_SetString(PyExc_ValueError,
7245 "prec too big");
7246 goto onError;
7247 }
7248 prec = prec*10 + (c - '0');
7249 }
7250 }
7251 } /* prec */
7252 if (fmtcnt >= 0) {
7253 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254 if (--fmtcnt >= 0)
7255 c = *fmt++;
7256 }
7257 }
7258 if (fmtcnt < 0) {
7259 PyErr_SetString(PyExc_ValueError,
7260 "incomplete format");
7261 goto onError;
7262 }
7263 if (c != '%') {
7264 v = getnextarg(args, arglen, &argidx);
7265 if (v == NULL)
7266 goto onError;
7267 }
7268 sign = 0;
7269 fill = ' ';
7270 switch (c) {
7271
7272 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007273 pbuf = formatbuf;
7274 /* presume that buffer length is at least 1 */
7275 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007276 len = 1;
7277 break;
7278
7279 case 's':
7280 case 'r':
7281 if (PyUnicode_Check(v) && c == 's') {
7282 temp = v;
7283 Py_INCREF(temp);
7284 }
7285 else {
7286 PyObject *unicode;
7287 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007288 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007289 else
7290 temp = PyObject_Repr(v);
7291 if (temp == NULL)
7292 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007293 if (PyUnicode_Check(temp))
7294 /* nothing to do */;
7295 else if (PyString_Check(temp)) {
7296 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007297 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007298 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007299 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007300 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007301 Py_DECREF(temp);
7302 temp = unicode;
7303 if (temp == NULL)
7304 goto onError;
7305 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007306 else {
7307 Py_DECREF(temp);
7308 PyErr_SetString(PyExc_TypeError,
7309 "%s argument has non-string str()");
7310 goto onError;
7311 }
7312 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007313 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314 len = PyUnicode_GET_SIZE(temp);
7315 if (prec >= 0 && len > prec)
7316 len = prec;
7317 break;
7318
7319 case 'i':
7320 case 'd':
7321 case 'u':
7322 case 'o':
7323 case 'x':
7324 case 'X':
7325 if (c == 'i')
7326 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007327 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007328 temp = formatlong(v, flags, prec, c);
7329 if (!temp)
7330 goto onError;
7331 pbuf = PyUnicode_AS_UNICODE(temp);
7332 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007333 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007335 else {
7336 pbuf = formatbuf;
7337 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7338 flags, prec, c, v);
7339 if (len < 0)
7340 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007341 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007342 }
7343 if (flags & F_ZERO)
7344 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007345 break;
7346
7347 case 'e':
7348 case 'E':
7349 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007350 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007351 case 'g':
7352 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007353 if (c == 'F')
7354 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007355 pbuf = formatbuf;
7356 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7357 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007358 if (len < 0)
7359 goto onError;
7360 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007361 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007362 fill = '0';
7363 break;
7364
7365 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007366 pbuf = formatbuf;
7367 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007368 if (len < 0)
7369 goto onError;
7370 break;
7371
7372 default:
7373 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007374 "unsupported format character '%c' (0x%x) "
7375 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00007376 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007377 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00007378 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007379 goto onError;
7380 }
7381 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007382 if (*pbuf == '-' || *pbuf == '+') {
7383 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007384 len--;
7385 }
7386 else if (flags & F_SIGN)
7387 sign = '+';
7388 else if (flags & F_BLANK)
7389 sign = ' ';
7390 else
7391 sign = 0;
7392 }
7393 if (width < len)
7394 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007395 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007396 reslen -= rescnt;
7397 rescnt = width + fmtcnt + 100;
7398 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007399 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007400 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007401 PyErr_NoMemory();
7402 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007403 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007404 if (_PyUnicode_Resize(&result, reslen) < 0) {
7405 Py_XDECREF(temp);
7406 goto onError;
7407 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007408 res = PyUnicode_AS_UNICODE(result)
7409 + reslen - rescnt;
7410 }
7411 if (sign) {
7412 if (fill != ' ')
7413 *res++ = sign;
7414 rescnt--;
7415 if (width > len)
7416 width--;
7417 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007418 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7419 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007420 assert(pbuf[1] == c);
7421 if (fill != ' ') {
7422 *res++ = *pbuf++;
7423 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007424 }
Tim Petersfff53252001-04-12 18:38:48 +00007425 rescnt -= 2;
7426 width -= 2;
7427 if (width < 0)
7428 width = 0;
7429 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007430 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007431 if (width > len && !(flags & F_LJUST)) {
7432 do {
7433 --rescnt;
7434 *res++ = fill;
7435 } while (--width > len);
7436 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007437 if (fill == ' ') {
7438 if (sign)
7439 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007440 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007441 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007442 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007443 *res++ = *pbuf++;
7444 *res++ = *pbuf++;
7445 }
7446 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007447 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448 res += len;
7449 rescnt -= len;
7450 while (--width >= len) {
7451 --rescnt;
7452 *res++ = ' ';
7453 }
7454 if (dict && (argidx < arglen) && c != '%') {
7455 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007456 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007457 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007458 goto onError;
7459 }
7460 Py_XDECREF(temp);
7461 } /* '%' */
7462 } /* until end */
7463 if (argidx < arglen && !dict) {
7464 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007465 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466 goto onError;
7467 }
7468
Thomas Woutersa96affe2006-03-12 00:29:36 +00007469 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7470 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471 if (args_owned) {
7472 Py_DECREF(args);
7473 }
7474 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475 return (PyObject *)result;
7476
7477 onError:
7478 Py_XDECREF(result);
7479 Py_DECREF(uformat);
7480 if (args_owned) {
7481 Py_DECREF(args);
7482 }
7483 return NULL;
7484}
7485
/* Buffer-protocol slot table for unicode objects; referenced as
   tp_as_buffer in PyUnicode_Type.  The four entries are, in order, the
   read-buffer, write-buffer, segment-count and char-buffer handlers, each
   cast to the slot's function-pointer type. */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,
    (writebufferproc) unicode_buffer_getwritebuf,
    (segcountproc) unicode_buffer_getsegcount,
    (charbufferproc) unicode_buffer_getcharbuf,
};
7492
Jeremy Hylton938ace62002-07-17 16:30:39 +00007493static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007494unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7495
Tim Peters6d6c1a32001-08-02 04:15:00 +00007496static PyObject *
7497unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7498{
7499 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007500 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007501 char *encoding = NULL;
7502 char *errors = NULL;
7503
Guido van Rossume023fe02001-08-30 03:12:59 +00007504 if (type != &PyUnicode_Type)
7505 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007506 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7507 kwlist, &x, &encoding, &errors))
7508 return NULL;
7509 if (x == NULL)
7510 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007511 if (encoding == NULL && errors == NULL)
7512 return PyObject_Unicode(x);
7513 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007514 return PyUnicode_FromEncodedObject(x, encoding, errors);
7515}
7516
Guido van Rossume023fe02001-08-30 03:12:59 +00007517static PyObject *
7518unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7519{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007520 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007521 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007522
7523 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7524 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7525 if (tmp == NULL)
7526 return NULL;
7527 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007528 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007529 if (pnew == NULL) {
7530 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007531 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007532 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007533 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7534 if (pnew->str == NULL) {
7535 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007536 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007537 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007538 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007539 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007540 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7541 pnew->length = n;
7542 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007543 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007544 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007545}
7546
/* Docstring of the unicode type; referenced as tp_doc in PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007553
/* Type object of the built-in unicode type.  The initializer is
   positional: every entry fills the slot named in its trailing comment,
   and a 0 leaves the slot unset. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    (cmpfunc) unicode_compare,          /* tp_compare */
    unicode_repr,                       /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
    Py_TPFLAGS_BASETYPE,                /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    0,                                  /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
7598
7599/* Initialize the Unicode implementation */
7600
Thomas Wouters78890102000-07-22 19:25:51 +00007601void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007602{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007603 int i;
7604
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007605 /* XXX - move this array to unicodectype.c ? */
7606 Py_UNICODE linebreak[] = {
7607 0x000A, /* LINE FEED */
7608 0x000D, /* CARRIAGE RETURN */
7609 0x001C, /* FILE SEPARATOR */
7610 0x001D, /* GROUP SEPARATOR */
7611 0x001E, /* RECORD SEPARATOR */
7612 0x0085, /* NEXT LINE */
7613 0x2028, /* LINE SEPARATOR */
7614 0x2029, /* PARAGRAPH SEPARATOR */
7615 };
7616
Fred Drakee4315f52000-05-09 19:53:39 +00007617 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007618 unicode_freelist = NULL;
7619 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007620 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007621 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007622 for (i = 0; i < 256; i++)
7623 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007624 if (PyType_Ready(&PyUnicode_Type) < 0)
7625 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007626
7627 /* initialize the linebreak bloom filter */
7628 bloom_linebreak = make_bloom_mask(
7629 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
7630 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007631}
7632
7633/* Finalize the Unicode implementation */
7634
7635void
Thomas Wouters78890102000-07-22 19:25:51 +00007636_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007637{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007638 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007639 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007640
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007641 Py_XDECREF(unicode_empty);
7642 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007643
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007644 for (i = 0; i < 256; i++) {
7645 if (unicode_latin1[i]) {
7646 Py_DECREF(unicode_latin1[i]);
7647 unicode_latin1[i] = NULL;
7648 }
7649 }
7650
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007651 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652 PyUnicodeObject *v = u;
7653 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007654 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007655 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007656 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007657 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007658 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007659 unicode_freelist = NULL;
7660 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007661}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007662
Anthony Baxterac6bd462006-04-13 02:06:09 +00007663#ifdef __cplusplus
7664}
7665#endif
7666
7667
/*
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/