blob: 91d76b7584e2dac6b973304c2674c8c5fb446c3d [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Limit for the Unicode object free list: unicode_dealloc() stops
   recycling objects once the free list holds this many entries. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a length (counted in Py_UNICODE
   elements, not bytes) less than this limit.  This reduces malloc()
   overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) +
    KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
    malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; defaults to little endian.  Exactly one of
   BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN is defined,
   selected by WORDS_BIGENDIAN. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/


#ifdef __cplusplus
extern "C" {
#endif

/* Free list for Unicode objects.  Freed objects are chained through
   their first pointer-sized word (see _PyUnicode_New() and
   unicode_dealloc()); unicode_freelist_size counts the entries and is
   bounded by MAX_UNICODE_FREELIST_SIZE. */
static PyUnicodeObject *unicode_freelist;
static int unicode_freelist_size;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  Slots are filled lazily by the constructors, so an
   entry is NULL until that character has been requested once. */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/
static char unicode_default_encoding[100];
114
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000116PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000117{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000118#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000119 return 0x10FFFF;
120#else
121 /* This is actually an illegal character, so it should
122 not be passed to unichr. */
123 return 0xFFFF;
124#endif
125}
126
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the least 5
   bits from each unicode characters as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Test whether the bit selected by the low 5 bits of `ch` is set in
   `mask`.  The result is non-zero when the character *may* be in the
   set described by the mask and zero when it definitely is not.

   The shift is done on an unsigned long constant: shifting a plain int
   `1` by 31 is undefined behaviour and, where it "works", sign-extends
   into the upper half of a 64-bit mask.  Both arguments are
   parenthesized so that compound expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap pre-filter followed by the exact line break test.  Note that
   `ch` is evaluated more than once; do not pass expressions with side
   effects. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
143
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000144Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000145{
146 /* calculate simple bloom-style bitmask for a given unicode string */
147
148 long mask;
149 Py_ssize_t i;
150
151 mask = 0;
152 for (i = 0; i < len; i++)
153 mask |= (1 << (ptr[i] & 0x1F));
154
155 return mask;
156}
157
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000158Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000159{
160 Py_ssize_t i;
161
162 for (i = 0; i < setlen; i++)
163 if (set[i] == chr)
164 return 1;
165
Fredrik Lundh77633512006-05-23 19:47:35 +0000166 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000167}
168
/* Bloom pre-filter followed by the exact membership test.  The whole
   expansion is parenthesized so that the macro behaves as a single
   operand, e.g. under `!` or next to `||`.  `chr` is evaluated more
   than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
171
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172/* --- Unicode Object ----------------------------------------------------- */
173
174static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000176 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177{
178 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000179
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000180 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000181 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000182 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184 /* Resizing shared object (unicode_empty or single character
185 objects) in-place is not allowed. Use PyUnicode_Resize()
186 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000187
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000188 if (unicode == unicode_empty ||
189 (unicode->length == 1 &&
190 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000191 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000192 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 return -1;
195 }
196
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000197 /* We allocate one more byte to make sure the string is Ux0000 terminated.
198 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000199 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000200 it contains). */
201
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 oldstr = unicode->str;
203 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
204 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000205 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000206 PyErr_NoMemory();
207 return -1;
208 }
209 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000210 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000212 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000214 if (unicode->defenc) {
215 Py_DECREF(unicode->defenc);
216 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217 }
218 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000219
Guido van Rossumd57fd912000-03-10 22:53:23 +0000220 return 0;
221}
222
223/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000224 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225
226 XXX This allocator could further be enhanced by assuring that the
227 free list never reduces its size below 1.
228
229*/
230
231static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000232PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233{
234 register PyUnicodeObject *unicode;
235
Andrew Dalkee0df7622006-05-27 11:04:36 +0000236 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237 if (length == 0 && unicode_empty != NULL) {
238 Py_INCREF(unicode_empty);
239 return unicode_empty;
240 }
241
242 /* Unicode freelist & memory allocation */
243 if (unicode_freelist) {
244 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000245 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000248 /* Keep-Alive optimization: we only upsize the buffer,
249 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000250 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000251 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000252 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000253 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254 }
255 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000256 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000258 }
259 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000262 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 if (unicode == NULL)
264 return NULL;
265 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
266 }
267
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000268 if (!unicode->str) {
269 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000270 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000271 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000272 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000273 * the caller fails before initializing str -- unicode_resize()
274 * reads str[0], and the Keep-Alive optimization can keep memory
275 * allocated for str alive across a call to unicode_dealloc(unicode).
276 * We don't want unicode_resize to read uninitialized memory in
277 * that case.
278 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000279 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000283 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285
286 onError:
287 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000288 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000289 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290}
291
292static
Guido van Rossum9475a232001-10-05 20:51:39 +0000293void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000295 if (PyUnicode_CheckExact(unicode) &&
296 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000297 /* Keep-Alive optimization */
298 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000299 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 unicode->str = NULL;
301 unicode->length = 0;
302 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000303 if (unicode->defenc) {
304 Py_DECREF(unicode->defenc);
305 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000306 }
307 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 *(PyUnicodeObject **)unicode = unicode_freelist;
309 unicode_freelist = unicode;
310 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311 }
312 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000313 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000314 Py_XDECREF(unicode->defenc);
Christian Heimese93237d2007-12-19 02:37:44 +0000315 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316 }
317}
318
Martin v. Löwis18e16552006-02-15 17:27:45 +0000319int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000320{
321 register PyUnicodeObject *v;
322
323 /* Argument checks */
324 if (unicode == NULL) {
325 PyErr_BadInternalCall();
326 return -1;
327 }
328 v = (PyUnicodeObject *)*unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000329 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000330 PyErr_BadInternalCall();
331 return -1;
332 }
333
334 /* Resizing unicode_empty and single character objects is not
335 possible since these are being shared. We simply return a fresh
336 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000337 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000338 (v == unicode_empty || v->length == 1)) {
339 PyUnicodeObject *w = _PyUnicode_New(length);
340 if (w == NULL)
341 return -1;
342 Py_UNICODE_COPY(w->str, v->str,
343 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000344 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000345 *unicode = (PyObject *)w;
346 return 0;
347 }
348
349 /* Note that we don't have to modify *unicode for unshared Unicode
350 objects, since we can modify them in-place. */
351 return unicode_resize(v, length);
352}
353
/* Internal API for use in unicodeobject.c only !

   Convenience wrapper around PyUnicode_Resize() that casts the address
   of the caller's object variable to the PyObject ** the public
   function expects; it returns whatever PyUnicode_Resize() returns. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), length)
357
Guido van Rossumd57fd912000-03-10 22:53:23 +0000358PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000359 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360{
361 PyUnicodeObject *unicode;
362
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000363 /* If the Unicode data is known at construction time, we can apply
364 some optimizations which share commonly used objects. */
365 if (u != NULL) {
366
367 /* Optimization for empty strings */
368 if (size == 0 && unicode_empty != NULL) {
369 Py_INCREF(unicode_empty);
370 return (PyObject *)unicode_empty;
371 }
372
373 /* Single character Unicode objects in the Latin-1 range are
374 shared when using this constructor */
375 if (size == 1 && *u < 256) {
376 unicode = unicode_latin1[*u];
377 if (!unicode) {
378 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000379 if (!unicode)
380 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000381 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000382 unicode_latin1[*u] = unicode;
383 }
384 Py_INCREF(unicode);
385 return (PyObject *)unicode;
386 }
387 }
Tim Petersced69f82003-09-16 20:30:58 +0000388
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 unicode = _PyUnicode_New(size);
390 if (!unicode)
391 return NULL;
392
393 /* Copy the Unicode data into the new object */
394 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000395 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000396
397 return (PyObject *)unicode;
398}
399
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000400PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
401{
402 PyUnicodeObject *unicode;
403 /* If the Unicode data is known at construction time, we can apply
404 some optimizations which share commonly used objects.
405 Also, this means the input must be UTF-8, so fall back to the
406 UTF-8 decoder at the end. */
407 if (u != NULL) {
408
409 /* Optimization for empty strings */
410 if (size == 0 && unicode_empty != NULL) {
411 Py_INCREF(unicode_empty);
412 return (PyObject *)unicode_empty;
413 }
414
415 /* Single characters are shared when using this constructor.
416 Restrict to ASCII, since the input must be UTF-8. */
417 if (size == 1 && Py_CHARMASK(*u) < 128) {
418 unicode = unicode_latin1[Py_CHARMASK(*u)];
419 if (!unicode) {
420 unicode = _PyUnicode_New(1);
421 if (!unicode)
422 return NULL;
423 unicode->str[0] = Py_CHARMASK(*u);
424 unicode_latin1[Py_CHARMASK(*u)] = unicode;
425 }
426 Py_INCREF(unicode);
427 return (PyObject *)unicode;
428 }
429
430 return PyUnicode_DecodeUTF8(u, size, NULL);
431 }
432
433 unicode = _PyUnicode_New(size);
434 if (!unicode)
435 return NULL;
436
437 return (PyObject *)unicode;
438}
439
440PyObject *PyUnicode_FromString(const char *u)
441{
442 size_t size = strlen(u);
443 if (size > PY_SSIZE_T_MAX) {
444 PyErr_SetString(PyExc_OverflowError, "input too long");
445 return NULL;
446 }
447
448 return PyUnicode_FromStringAndSize(u, size);
449}
450
Guido van Rossumd57fd912000-03-10 22:53:23 +0000451#ifdef HAVE_WCHAR_H
452
453PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000454 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455{
456 PyUnicodeObject *unicode;
457
458 if (w == NULL) {
459 PyErr_BadInternalCall();
460 return NULL;
461 }
462
463 unicode = _PyUnicode_New(size);
464 if (!unicode)
465 return NULL;
466
467 /* Copy the wchar_t data into the new object */
468#ifdef HAVE_USABLE_WCHAR_T
469 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000470#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000471 {
472 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000473 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000474 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000475 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000476 *u++ = *w++;
477 }
478#endif
479
480 return (PyObject *)unicode;
481}
482
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000483static void
484makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
485{
486 *fmt++ = '%';
487 if (width) {
488 if (zeropad)
489 *fmt++ = '0';
490 fmt += sprintf(fmt, "%d", width);
491 }
492 if (precision)
493 fmt += sprintf(fmt, ".%d", precision);
494 if (longflag)
495 *fmt++ = 'l';
496 else if (size_tflag) {
497 char *f = PY_FORMAT_SIZE_T;
498 while (*f)
499 *fmt++ = *f++;
500 }
501 *fmt++ = c;
502 *fmt = '\0';
503}
504
505#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
506
507PyObject *
508PyUnicode_FromFormatV(const char *format, va_list vargs)
509{
510 va_list count;
511 Py_ssize_t callcount = 0;
512 PyObject **callresults = NULL;
513 PyObject **callresult = NULL;
514 Py_ssize_t n = 0;
515 int width = 0;
516 int precision = 0;
517 int zeropad;
518 const char* f;
519 Py_UNICODE *s;
520 PyObject *string;
521 /* used by sprintf */
522 char buffer[21];
523 /* use abuffer instead of buffer, if we need more space
524 * (which can happen if there's a format specifier with width). */
525 char *abuffer = NULL;
526 char *realbuffer;
527 Py_ssize_t abuffersize = 0;
528 char fmt[60]; /* should be enough for %0width.precisionld */
529 const char *copy;
530
531#ifdef VA_LIST_IS_ARRAY
532 Py_MEMCPY(count, vargs, sizeof(va_list));
533#else
534#ifdef __va_copy
535 __va_copy(count, vargs);
536#else
537 count = vargs;
538#endif
539#endif
540 /* step 1: count the number of %S/%R format specifications
541 * (we call PyObject_Str()/PyObject_Repr() for these objects
542 * once during step 3 and put the result in an array) */
543 for (f = format; *f; f++) {
544 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
545 ++callcount;
546 }
547 /* step 2: allocate memory for the results of
548 * PyObject_Str()/PyObject_Repr() calls */
549 if (callcount) {
550 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
551 if (!callresults) {
552 PyErr_NoMemory();
553 return NULL;
554 }
555 callresult = callresults;
556 }
557 /* step 3: figure out how large a buffer we need */
558 for (f = format; *f; f++) {
559 if (*f == '%') {
560 const char* p = f;
561 width = 0;
562 while (isdigit(*f))
563 width = (width*10) + *f++ - '0';
564 while (*++f && *f != '%' && !isalpha(*f))
565 ;
566
567 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
568 * they don't affect the amount of space we reserve.
569 */
570 if ((*f == 'l' || *f == 'z') &&
571 (f[1] == 'd' || f[1] == 'u'))
572 ++f;
573
574 switch (*f) {
575 case 'c':
576 (void)va_arg(count, int);
577 /* fall through... */
578 case '%':
579 n++;
580 break;
581 case 'd': case 'u': case 'i': case 'x':
582 (void) va_arg(count, int);
583 /* 20 bytes is enough to hold a 64-bit
584 integer. Decimal takes the most space.
585 This isn't enough for octal.
586 If a width is specified we need more
587 (which we allocate later). */
588 if (width < 20)
589 width = 20;
590 n += width;
591 if (abuffersize < width)
592 abuffersize = width;
593 break;
594 case 's':
595 {
596 /* UTF-8 */
597 unsigned char*s;
598 s = va_arg(count, unsigned char*);
599 while (*s) {
600 if (*s < 128) {
601 n++; s++;
602 } else if (*s < 0xc0) {
603 /* invalid UTF-8 */
604 n++; s++;
605 } else if (*s < 0xc0) {
606 n++;
607 s++; if(!*s)break;
608 s++;
609 } else if (*s < 0xe0) {
610 n++;
611 s++; if(!*s)break;
612 s++; if(!*s)break;
613 s++;
614 } else {
615 #ifdef Py_UNICODE_WIDE
616 n++;
617 #else
618 n+=2;
619 #endif
620 s++; if(!*s)break;
621 s++; if(!*s)break;
622 s++; if(!*s)break;
623 s++;
624 }
625 }
626 break;
627 }
628 case 'U':
629 {
630 PyObject *obj = va_arg(count, PyObject *);
631 assert(obj && PyUnicode_Check(obj));
632 n += PyUnicode_GET_SIZE(obj);
633 break;
634 }
635 case 'V':
636 {
637 PyObject *obj = va_arg(count, PyObject *);
638 const char *str = va_arg(count, const char *);
639 assert(obj || str);
640 assert(!obj || PyUnicode_Check(obj));
641 if (obj)
642 n += PyUnicode_GET_SIZE(obj);
643 else
644 n += strlen(str);
645 break;
646 }
647 case 'S':
648 {
649 PyObject *obj = va_arg(count, PyObject *);
650 PyObject *str;
651 assert(obj);
652 str = PyObject_Str(obj);
653 if (!str)
654 goto fail;
655 n += PyUnicode_GET_SIZE(str);
656 /* Remember the str and switch to the next slot */
657 *callresult++ = str;
658 break;
659 }
660 case 'R':
661 {
662 PyObject *obj = va_arg(count, PyObject *);
663 PyObject *repr;
664 assert(obj);
665 repr = PyObject_Repr(obj);
666 if (!repr)
667 goto fail;
668 n += PyUnicode_GET_SIZE(repr);
669 /* Remember the repr and switch to the next slot */
670 *callresult++ = repr;
671 break;
672 }
673 case 'p':
674 (void) va_arg(count, int);
675 /* maximum 64-bit pointer representation:
676 * 0xffffffffffffffff
677 * so 19 characters is enough.
678 * XXX I count 18 -- what's the extra for?
679 */
680 n += 19;
681 break;
682 default:
683 /* if we stumble upon an unknown
684 formatting code, copy the rest of
685 the format string to the output
686 string. (we cannot just skip the
687 code, since there's no way to know
688 what's in the argument list) */
689 n += strlen(p);
690 goto expand;
691 }
692 } else
693 n++;
694 }
695 expand:
696 if (abuffersize > 20) {
697 abuffer = PyMem_Malloc(abuffersize);
698 if (!abuffer) {
699 PyErr_NoMemory();
700 goto fail;
701 }
702 realbuffer = abuffer;
703 }
704 else
705 realbuffer = buffer;
706 /* step 4: fill the buffer */
707 /* Since we've analyzed how much space we need for the worst case,
708 we don't have to resize the string.
709 There can be no errors beyond this point. */
710 string = PyUnicode_FromUnicode(NULL, n);
711 if (!string)
712 goto fail;
713
714 s = PyUnicode_AS_UNICODE(string);
715 callresult = callresults;
716
717 for (f = format; *f; f++) {
718 if (*f == '%') {
719 const char* p = f++;
720 int longflag = 0;
721 int size_tflag = 0;
722 zeropad = (*f == '0');
723 /* parse the width.precision part */
724 width = 0;
725 while (isdigit(*f))
726 width = (width*10) + *f++ - '0';
727 precision = 0;
728 if (*f == '.') {
729 f++;
730 while (isdigit(*f))
731 precision = (precision*10) + *f++ - '0';
732 }
733 /* handle the long flag, but only for %ld and %lu.
734 others can be added when necessary. */
735 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
736 longflag = 1;
737 ++f;
738 }
739 /* handle the size_t flag. */
740 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
741 size_tflag = 1;
742 ++f;
743 }
744
745 switch (*f) {
746 case 'c':
747 *s++ = va_arg(vargs, int);
748 break;
749 case 'd':
750 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
751 if (longflag)
752 sprintf(realbuffer, fmt, va_arg(vargs, long));
753 else if (size_tflag)
754 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
755 else
756 sprintf(realbuffer, fmt, va_arg(vargs, int));
757 appendstring(realbuffer);
758 break;
759 case 'u':
760 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
761 if (longflag)
762 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
763 else if (size_tflag)
764 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
765 else
766 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
767 appendstring(realbuffer);
768 break;
769 case 'i':
770 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
771 sprintf(realbuffer, fmt, va_arg(vargs, int));
772 appendstring(realbuffer);
773 break;
774 case 'x':
775 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
776 sprintf(realbuffer, fmt, va_arg(vargs, int));
777 appendstring(realbuffer);
778 break;
779 case 's':
780 {
781 /* Parameter must be UTF-8 encoded.
782 In case of encoding errors, use
783 the replacement character. */
784 PyObject *u;
785 p = va_arg(vargs, char*);
786 u = PyUnicode_DecodeUTF8(p, strlen(p),
787 "replace");
788 if (!u)
789 goto fail;
790 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
791 PyUnicode_GET_SIZE(u));
792 s += PyUnicode_GET_SIZE(u);
793 Py_DECREF(u);
794 break;
795 }
796 case 'U':
797 {
798 PyObject *obj = va_arg(vargs, PyObject *);
799 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
800 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
801 s += size;
802 break;
803 }
804 case 'V':
805 {
806 PyObject *obj = va_arg(vargs, PyObject *);
807 const char *str = va_arg(vargs, const char *);
808 if (obj) {
809 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
810 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
811 s += size;
812 } else {
813 appendstring(str);
814 }
815 break;
816 }
817 case 'S':
818 case 'R':
819 {
820 Py_UNICODE *ucopy;
821 Py_ssize_t usize;
822 Py_ssize_t upos;
823 /* unused, since we already have the result */
824 (void) va_arg(vargs, PyObject *);
825 ucopy = PyUnicode_AS_UNICODE(*callresult);
826 usize = PyUnicode_GET_SIZE(*callresult);
827 for (upos = 0; upos<usize;)
828 *s++ = ucopy[upos++];
829 /* We're done with the unicode()/repr() => forget it */
830 Py_DECREF(*callresult);
831 /* switch to next unicode()/repr() result */
832 ++callresult;
833 break;
834 }
835 case 'p':
836 sprintf(buffer, "%p", va_arg(vargs, void*));
837 /* %p is ill-defined: ensure leading 0x. */
838 if (buffer[1] == 'X')
839 buffer[1] = 'x';
840 else if (buffer[1] != 'x') {
841 memmove(buffer+2, buffer, strlen(buffer)+1);
842 buffer[0] = '0';
843 buffer[1] = 'x';
844 }
845 appendstring(buffer);
846 break;
847 case '%':
848 *s++ = '%';
849 break;
850 default:
851 appendstring(p);
852 goto end;
853 }
854 } else
855 *s++ = *f;
856 }
857
858 end:
859 if (callresults)
860 PyMem_Free(callresults);
861 if (abuffer)
862 PyMem_Free(abuffer);
863 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
864 return string;
865 fail:
866 if (callresults) {
867 PyObject **callresult2 = callresults;
868 while (callresult2 < callresult) {
869 Py_DECREF(*callresult2);
870 ++callresult2;
871 }
872 PyMem_Free(callresults);
873 }
874 if (abuffer)
875 PyMem_Free(abuffer);
876 return NULL;
877}
878
879#undef appendstring
880
881PyObject *
882PyUnicode_FromFormat(const char *format, ...)
883{
884 PyObject* ret;
885 va_list vargs;
886
887#ifdef HAVE_STDARG_PROTOTYPES
888 va_start(vargs, format);
889#else
890 va_start(vargs);
891#endif
892 ret = PyUnicode_FromFormatV(format, vargs);
893 va_end(vargs);
894 return ret;
895}
896
Martin v. Löwis18e16552006-02-15 17:27:45 +0000897Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
898 wchar_t *w,
899 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000900{
901 if (unicode == NULL) {
902 PyErr_BadInternalCall();
903 return -1;
904 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000905
906 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000907 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000908 size = PyUnicode_GET_SIZE(unicode) + 1;
909
Guido van Rossumd57fd912000-03-10 22:53:23 +0000910#ifdef HAVE_USABLE_WCHAR_T
911 memcpy(w, unicode->str, size * sizeof(wchar_t));
912#else
913 {
914 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000915 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000916 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000917 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000918 *w++ = *u++;
919 }
920#endif
921
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000922 if (size > PyUnicode_GET_SIZE(unicode))
923 return PyUnicode_GET_SIZE(unicode);
924 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000925 return size;
926}
927
928#endif
929
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000930PyObject *PyUnicode_FromOrdinal(int ordinal)
931{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000932 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000933
934#ifdef Py_UNICODE_WIDE
935 if (ordinal < 0 || ordinal > 0x10ffff) {
936 PyErr_SetString(PyExc_ValueError,
937 "unichr() arg not in range(0x110000) "
938 "(wide Python build)");
939 return NULL;
940 }
941#else
942 if (ordinal < 0 || ordinal > 0xffff) {
943 PyErr_SetString(PyExc_ValueError,
944 "unichr() arg not in range(0x10000) "
945 "(narrow Python build)");
946 return NULL;
947 }
948#endif
949
Hye-Shik Chang40574832004-04-06 07:24:51 +0000950 s[0] = (Py_UNICODE)ordinal;
951 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000952}
953
Guido van Rossumd57fd912000-03-10 22:53:23 +0000954PyObject *PyUnicode_FromObject(register PyObject *obj)
955{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000956 /* XXX Perhaps we should make this API an alias of
957 PyObject_Unicode() instead ?! */
958 if (PyUnicode_CheckExact(obj)) {
959 Py_INCREF(obj);
960 return obj;
961 }
962 if (PyUnicode_Check(obj)) {
963 /* For a Unicode subtype that's not a Unicode object,
964 return a true Unicode object with the same data. */
965 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
966 PyUnicode_GET_SIZE(obj));
967 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000968 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
969}
970
971PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
972 const char *encoding,
973 const char *errors)
974{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000975 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000976 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000977 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000978
Guido van Rossumd57fd912000-03-10 22:53:23 +0000979 if (obj == NULL) {
980 PyErr_BadInternalCall();
981 return NULL;
982 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000983
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000984#if 0
985 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000986 that no encodings is given and then redirect to
987 PyObject_Unicode() which then applies the additional logic for
988 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000989
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000990 NOTE: This API should really only be used for object which
991 represent *encoded* Unicode !
992
993 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000994 if (PyUnicode_Check(obj)) {
995 if (encoding) {
996 PyErr_SetString(PyExc_TypeError,
997 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000998 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000999 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001000 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001001 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001002#else
1003 if (PyUnicode_Check(obj)) {
1004 PyErr_SetString(PyExc_TypeError,
1005 "decoding Unicode is not supported");
1006 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001007 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001008#endif
1009
1010 /* Coerce object */
1011 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001012 s = PyString_AS_STRING(obj);
1013 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001014 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001015 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1016 /* Overwrite the error message with something more useful in
1017 case of a TypeError. */
1018 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001019 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001020 "coercing to Unicode: need string or buffer, "
1021 "%.80s found",
Christian Heimese93237d2007-12-19 02:37:44 +00001022 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001023 goto onError;
1024 }
Tim Petersced69f82003-09-16 20:30:58 +00001025
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001026 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001027 if (len == 0) {
1028 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001029 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001030 }
Tim Petersced69f82003-09-16 20:30:58 +00001031 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001032 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001033
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001034 return v;
1035
1036 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001037 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001038}
1039
1040PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001041 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042 const char *encoding,
1043 const char *errors)
1044{
1045 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001046
1047 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001048 encoding = PyUnicode_GetDefaultEncoding();
1049
1050 /* Shortcuts for common default encodings */
1051 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001052 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001053 else if (strcmp(encoding, "latin-1") == 0)
1054 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001055#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1056 else if (strcmp(encoding, "mbcs") == 0)
1057 return PyUnicode_DecodeMBCS(s, size, errors);
1058#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001059 else if (strcmp(encoding, "ascii") == 0)
1060 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001061
1062 /* Decode via the codec registry */
1063 buffer = PyBuffer_FromMemory((void *)s, size);
1064 if (buffer == NULL)
1065 goto onError;
1066 unicode = PyCodec_Decode(buffer, encoding, errors);
1067 if (unicode == NULL)
1068 goto onError;
1069 if (!PyUnicode_Check(unicode)) {
1070 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001071 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001072 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001073 Py_DECREF(unicode);
1074 goto onError;
1075 }
1076 Py_DECREF(buffer);
1077 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001078
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079 onError:
1080 Py_XDECREF(buffer);
1081 return NULL;
1082}
1083
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001084PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1085 const char *encoding,
1086 const char *errors)
1087{
1088 PyObject *v;
1089
1090 if (!PyUnicode_Check(unicode)) {
1091 PyErr_BadArgument();
1092 goto onError;
1093 }
1094
1095 if (encoding == NULL)
1096 encoding = PyUnicode_GetDefaultEncoding();
1097
1098 /* Decode via the codec registry */
1099 v = PyCodec_Decode(unicode, encoding, errors);
1100 if (v == NULL)
1101 goto onError;
1102 return v;
1103
1104 onError:
1105 return NULL;
1106}
1107
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001109 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110 const char *encoding,
1111 const char *errors)
1112{
1113 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001114
Guido van Rossumd57fd912000-03-10 22:53:23 +00001115 unicode = PyUnicode_FromUnicode(s, size);
1116 if (unicode == NULL)
1117 return NULL;
1118 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1119 Py_DECREF(unicode);
1120 return v;
1121}
1122
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001123PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1124 const char *encoding,
1125 const char *errors)
1126{
1127 PyObject *v;
1128
1129 if (!PyUnicode_Check(unicode)) {
1130 PyErr_BadArgument();
1131 goto onError;
1132 }
1133
1134 if (encoding == NULL)
1135 encoding = PyUnicode_GetDefaultEncoding();
1136
1137 /* Encode via the codec registry */
1138 v = PyCodec_Encode(unicode, encoding, errors);
1139 if (v == NULL)
1140 goto onError;
1141 return v;
1142
1143 onError:
1144 return NULL;
1145}
1146
Guido van Rossumd57fd912000-03-10 22:53:23 +00001147PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1148 const char *encoding,
1149 const char *errors)
1150{
1151 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001152
Guido van Rossumd57fd912000-03-10 22:53:23 +00001153 if (!PyUnicode_Check(unicode)) {
1154 PyErr_BadArgument();
1155 goto onError;
1156 }
Fred Drakee4315f52000-05-09 19:53:39 +00001157
Tim Petersced69f82003-09-16 20:30:58 +00001158 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001159 encoding = PyUnicode_GetDefaultEncoding();
1160
1161 /* Shortcuts for common default encodings */
1162 if (errors == NULL) {
1163 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001164 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001165 else if (strcmp(encoding, "latin-1") == 0)
1166 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001167#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1168 else if (strcmp(encoding, "mbcs") == 0)
1169 return PyUnicode_AsMBCSString(unicode);
1170#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001171 else if (strcmp(encoding, "ascii") == 0)
1172 return PyUnicode_AsASCIIString(unicode);
1173 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174
1175 /* Encode via the codec registry */
1176 v = PyCodec_Encode(unicode, encoding, errors);
1177 if (v == NULL)
1178 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179 if (!PyString_Check(v)) {
1180 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001181 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001182 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001183 Py_DECREF(v);
1184 goto onError;
1185 }
1186 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001187
Guido van Rossumd57fd912000-03-10 22:53:23 +00001188 onError:
1189 return NULL;
1190}
1191
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001192PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1193 const char *errors)
1194{
1195 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1196
1197 if (v)
1198 return v;
1199 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1200 if (v && errors == NULL)
1201 ((PyUnicodeObject *)unicode)->defenc = v;
1202 return v;
1203}
1204
Guido van Rossumd57fd912000-03-10 22:53:23 +00001205Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1206{
1207 if (!PyUnicode_Check(unicode)) {
1208 PyErr_BadArgument();
1209 goto onError;
1210 }
1211 return PyUnicode_AS_UNICODE(unicode);
1212
1213 onError:
1214 return NULL;
1215}
1216
Martin v. Löwis18e16552006-02-15 17:27:45 +00001217Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001218{
1219 if (!PyUnicode_Check(unicode)) {
1220 PyErr_BadArgument();
1221 goto onError;
1222 }
1223 return PyUnicode_GET_SIZE(unicode);
1224
1225 onError:
1226 return -1;
1227}
1228
Thomas Wouters78890102000-07-22 19:25:51 +00001229const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001230{
1231 return unicode_default_encoding;
1232}
1233
1234int PyUnicode_SetDefaultEncoding(const char *encoding)
1235{
1236 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001237
Fred Drakee4315f52000-05-09 19:53:39 +00001238 /* Make sure the encoding is valid. As side effect, this also
1239 loads the encoding into the codec registry cache. */
1240 v = _PyCodec_Lookup(encoding);
1241 if (v == NULL)
1242 goto onError;
1243 Py_DECREF(v);
1244 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +00001245 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +00001246 sizeof(unicode_default_encoding));
1247 return 0;
1248
1249 onError:
1250 return -1;
1251}
1252
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001253/* error handling callback helper:
1254 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001255 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001256 and adjust various state variables.
1257 return 0 on success, -1 on error
1258*/
1259
1260static
1261int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1262 const char *encoding, const char *reason,
Walter Dörwald87578782007-08-30 15:30:09 +00001263 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1264 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001265 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001266{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001267 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001268
1269 PyObject *restuple = NULL;
1270 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001271 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1272 Py_ssize_t requiredsize;
1273 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001274 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001275 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001276 int res = -1;
1277
1278 if (*errorHandler == NULL) {
1279 *errorHandler = PyCodec_LookupError(errors);
1280 if (*errorHandler == NULL)
1281 goto onError;
1282 }
1283
1284 if (*exceptionObject == NULL) {
1285 *exceptionObject = PyUnicodeDecodeError_Create(
1286 encoding, input, insize, *startinpos, *endinpos, reason);
1287 if (*exceptionObject == NULL)
1288 goto onError;
1289 }
1290 else {
1291 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1292 goto onError;
1293 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1294 goto onError;
1295 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1296 goto onError;
1297 }
1298
1299 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1300 if (restuple == NULL)
1301 goto onError;
1302 if (!PyTuple_Check(restuple)) {
1303 PyErr_Format(PyExc_TypeError, &argparse[4]);
1304 goto onError;
1305 }
1306 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1307 goto onError;
1308 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001309 newpos = insize+newpos;
1310 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001311 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001312 goto onError;
1313 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001314
1315 /* need more space? (at least enough for what we
1316 have+the replacement+the rest of the string (starting
1317 at the new input position), so we won't have to check space
1318 when there are no errors in the rest of the string) */
1319 repptr = PyUnicode_AS_UNICODE(repunicode);
1320 repsize = PyUnicode_GET_SIZE(repunicode);
1321 requiredsize = *outpos + repsize + insize-newpos;
1322 if (requiredsize > outsize) {
1323 if (requiredsize<2*outsize)
1324 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001325 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001326 goto onError;
1327 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1328 }
1329 *endinpos = newpos;
1330 *inptr = input + newpos;
1331 Py_UNICODE_COPY(*outptr, repptr, repsize);
1332 *outptr += repsize;
1333 *outpos += repsize;
1334 /* we made it! */
1335 res = 0;
1336
1337 onError:
1338 Py_XDECREF(restuple);
1339 return res;
1340}
1341
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001342/* --- UTF-7 Codec -------------------------------------------------------- */
1343
1344/* see RFC2152 for details */
1345
/* Classification of the 128 ASCII characters for UTF-7, indexed by
   character code.  Each entry indicates whether the character is
   special, i.e. cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static char utf7_special[128] = {
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1364
/* SPECIAL(c, encodeO, encodeWS): true when character `c` cannot be
   written directly in UTF-7 and has to go into a base64 shift sequence.
   `encodeO` / `encodeWS` select whether the optional classes (RFC2152
   Set O, whitespace) count as special as well.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low six bits of `n`. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when `c` is one of the 64 base64 digits.
   Implemented with explicit range comparisons instead of isalnum():
   isalnum() depends on the current locale and is undefined for
   arguments that do not fit in an unsigned char, yet this macro is
   applied to Py_UNICODE values.  Like UB64() below, this assumes an
   ASCII-compatible execution character set.  `c` is evaluated more
   than once, so pass a plain variable. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || (c) == '+' || (c) == '/')

/* UB64(c): numeric value (0..63) of the base64 digit `c`.  Only
   meaningful when B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while the bit accumulator `ch` holds at least
   six pending bits (`bits`), emit the top six as a base64 digit through
   `out` and decrease `bits` accordingly. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* DECODE(out, ch, bits, surrogate): while the bit accumulator `ch`
   holds at least sixteen pending bits, extract one 16-bit unit.  Units
   in the range 0xDC00..0xDFFF are rejected: the macro sets `errmsg` and
   jumps to the label `utf7Error`, both of which must exist in the
   expanding function.  All other units are stored through `out`. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001407
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001408PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001409 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001410 const char *errors)
1411{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001412 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1413}
1414
1415PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1416 Py_ssize_t size,
1417 const char *errors,
1418 Py_ssize_t *consumed)
1419{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001420 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001421 Py_ssize_t startinpos;
1422 Py_ssize_t endinpos;
1423 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001424 const char *e;
1425 PyUnicodeObject *unicode;
1426 Py_UNICODE *p;
1427 const char *errmsg = "";
1428 int inShift = 0;
1429 unsigned int bitsleft = 0;
1430 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001431 int surrogate = 0;
1432 PyObject *errorHandler = NULL;
1433 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001434
1435 unicode = _PyUnicode_New(size);
1436 if (!unicode)
1437 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001438 if (size == 0) {
1439 if (consumed)
1440 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001441 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001442 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001443
1444 p = unicode->str;
1445 e = s + size;
1446
1447 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001448 Py_UNICODE ch;
1449 restart:
1450 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001451
1452 if (inShift) {
1453 if ((ch == '-') || !B64CHAR(ch)) {
1454 inShift = 0;
1455 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001456
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001457 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1458 if (bitsleft >= 6) {
1459 /* The shift sequence has a partial character in it. If
1460 bitsleft < 6 then we could just classify it as padding
1461 but that is not the case here */
1462
1463 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001464 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001465 }
1466 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001467 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001468 here so indicate the potential of a misencoded character. */
1469
1470 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1471 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1472 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001473 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001474 }
1475
1476 if (ch == '-') {
1477 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001478 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001479 inShift = 1;
1480 }
1481 } else if (SPECIAL(ch,0,0)) {
1482 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001483 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001484 } else {
1485 *p++ = ch;
1486 }
1487 } else {
1488 charsleft = (charsleft << 6) | UB64(ch);
1489 bitsleft += 6;
1490 s++;
1491 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1492 }
1493 }
1494 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001495 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001496 s++;
1497 if (s < e && *s == '-') {
1498 s++;
1499 *p++ = '+';
1500 } else
1501 {
1502 inShift = 1;
1503 bitsleft = 0;
1504 }
1505 }
1506 else if (SPECIAL(ch,0,0)) {
Walter Dörwald9d045422007-08-30 15:34:55 +00001507 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001508 errmsg = "unexpected special character";
1509 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001510 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001511 }
1512 else {
1513 *p++ = ch;
1514 s++;
1515 }
1516 continue;
1517 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001518 outpos = p-PyUnicode_AS_UNICODE(unicode);
1519 endinpos = s-starts;
1520 if (unicode_decode_call_errorhandler(
1521 errors, &errorHandler,
1522 "utf7", errmsg,
1523 starts, size, &startinpos, &endinpos, &exc, &s,
1524 (PyObject **)&unicode, &outpos, &p))
1525 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001526 }
1527
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001528 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001529 outpos = p-PyUnicode_AS_UNICODE(unicode);
1530 endinpos = size;
1531 if (unicode_decode_call_errorhandler(
1532 errors, &errorHandler,
1533 "utf7", "unterminated shift sequence",
1534 starts, size, &startinpos, &endinpos, &exc, &s,
1535 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001536 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001537 if (s < e)
1538 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001539 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001540 if (consumed) {
1541 if(inShift)
1542 *consumed = startinpos;
1543 else
1544 *consumed = s-starts;
1545 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001546
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001547 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001548 goto onError;
1549
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001550 Py_XDECREF(errorHandler);
1551 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001552 return (PyObject *)unicode;
1553
1554onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001555 Py_XDECREF(errorHandler);
1556 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001557 Py_DECREF(unicode);
1558 return NULL;
1559}
1560
1561
1562PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001563 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001564 int encodeSetO,
1565 int encodeWhiteSpace,
1566 const char *errors)
1567{
1568 PyObject *v;
1569 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001570 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001571 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001572 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001573 unsigned int bitsleft = 0;
1574 unsigned long charsleft = 0;
1575 char * out;
1576 char * start;
1577
1578 if (size == 0)
1579 return PyString_FromStringAndSize(NULL, 0);
1580
1581 v = PyString_FromStringAndSize(NULL, cbAllocated);
1582 if (v == NULL)
1583 return NULL;
1584
1585 start = out = PyString_AS_STRING(v);
1586 for (;i < size; ++i) {
1587 Py_UNICODE ch = s[i];
1588
1589 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001590 if (ch == '+') {
1591 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001592 *out++ = '-';
1593 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1594 charsleft = ch;
1595 bitsleft = 16;
1596 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001597 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001598 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001599 } else {
1600 *out++ = (char) ch;
1601 }
1602 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001603 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1604 *out++ = B64(charsleft << (6-bitsleft));
1605 charsleft = 0;
1606 bitsleft = 0;
1607 /* Characters not in the BASE64 set implicitly unshift the sequence
1608 so no '-' is required, except if the character is itself a '-' */
1609 if (B64CHAR(ch) || ch == '-') {
1610 *out++ = '-';
1611 }
1612 inShift = 0;
1613 *out++ = (char) ch;
1614 } else {
1615 bitsleft += 16;
1616 charsleft = (charsleft << 16) | ch;
1617 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1618
1619 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001620 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001621 or '-' then the shift sequence will be terminated implicitly and we
1622 don't have to insert a '-'. */
1623
1624 if (bitsleft == 0) {
1625 if (i + 1 < size) {
1626 Py_UNICODE ch2 = s[i+1];
1627
1628 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001629
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001630 } else if (B64CHAR(ch2) || ch2 == '-') {
1631 *out++ = '-';
1632 inShift = 0;
1633 } else {
1634 inShift = 0;
1635 }
1636
1637 }
1638 else {
1639 *out++ = '-';
1640 inShift = 0;
1641 }
1642 }
Tim Petersced69f82003-09-16 20:30:58 +00001643 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001644 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001645 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001646 if (bitsleft) {
1647 *out++= B64(charsleft << (6-bitsleft) );
1648 *out++ = '-';
1649 }
1650
Tim Peters5de98422002-04-27 18:44:32 +00001651 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001652 return v;
1653}
1654
1655#undef SPECIAL
1656#undef B64
1657#undef B64CHAR
1658#undef UB64
1659#undef ENCODE
1660#undef DECODE
1661
Guido van Rossumd57fd912000-03-10 22:53:23 +00001662/* --- UTF-8 Codec -------------------------------------------------------- */
1663
/* Map a UTF-8 encoded prefix (first) byte to the length of the sequence
   it introduces.  Zero means illegal prefix.  See RFC 2279 for
   details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7f: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xbf: not valid as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff: four-, five- and six-byte sequences; 0xfe/0xff illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1685
Guido van Rossumd57fd912000-03-10 22:53:23 +00001686PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001687 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001688 const char *errors)
1689{
Walter Dörwald69652032004-09-07 20:24:22 +00001690 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1691}
1692
1693PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001694 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001695 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001696 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001697{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001698 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001699 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001700 Py_ssize_t startinpos;
1701 Py_ssize_t endinpos;
1702 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001703 const char *e;
1704 PyUnicodeObject *unicode;
1705 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001706 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001707 PyObject *errorHandler = NULL;
1708 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001709
1710 /* Note: size will always be longer than the resulting Unicode
1711 character count */
1712 unicode = _PyUnicode_New(size);
1713 if (!unicode)
1714 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001715 if (size == 0) {
1716 if (consumed)
1717 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001719 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001720
1721 /* Unpack UTF-8 encoded data */
1722 p = unicode->str;
1723 e = s + size;
1724
1725 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001726 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001727
1728 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001729 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001730 s++;
1731 continue;
1732 }
1733
1734 n = utf8_code_length[ch];
1735
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001736 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001737 if (consumed)
1738 break;
1739 else {
1740 errmsg = "unexpected end of data";
1741 startinpos = s-starts;
1742 endinpos = size;
1743 goto utf8Error;
1744 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001745 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001746
1747 switch (n) {
1748
1749 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001750 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001751 startinpos = s-starts;
1752 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001753 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001754
1755 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001756 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001757 startinpos = s-starts;
1758 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001759 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001760
1761 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001762 if ((s[1] & 0xc0) != 0x80) {
1763 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001764 startinpos = s-starts;
1765 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001766 goto utf8Error;
1767 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001768 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001769 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001770 startinpos = s-starts;
1771 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001772 errmsg = "illegal encoding";
1773 goto utf8Error;
1774 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001775 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001776 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001777 break;
1778
1779 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001780 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001781 (s[2] & 0xc0) != 0x80) {
1782 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001783 startinpos = s-starts;
1784 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001785 goto utf8Error;
1786 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001787 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001788 if (ch < 0x0800) {
1789 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001790 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001791
1792 XXX For wide builds (UCS-4) we should probably try
1793 to recombine the surrogates into a single code
1794 unit.
1795 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001796 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001797 startinpos = s-starts;
1798 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001799 goto utf8Error;
1800 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001801 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001802 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001803 break;
1804
1805 case 4:
1806 if ((s[1] & 0xc0) != 0x80 ||
1807 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001808 (s[3] & 0xc0) != 0x80) {
1809 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001810 startinpos = s-starts;
1811 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001812 goto utf8Error;
1813 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001814 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1815 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1816 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001817 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001818 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001819 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001820 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001821 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001822 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001823 startinpos = s-starts;
1824 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001825 goto utf8Error;
1826 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001827#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001828 *p++ = (Py_UNICODE)ch;
1829#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001830 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001831
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001832 /* translate from 10000..10FFFF to 0..FFFF */
1833 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001834
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001835 /* high surrogate = top 10 bits added to D800 */
1836 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001837
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001838 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001839 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001840#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001841 break;
1842
1843 default:
1844 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001845 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001846 startinpos = s-starts;
1847 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001848 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849 }
1850 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001851 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001852
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001853 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001854 outpos = p-PyUnicode_AS_UNICODE(unicode);
1855 if (unicode_decode_call_errorhandler(
1856 errors, &errorHandler,
1857 "utf8", errmsg,
1858 starts, size, &startinpos, &endinpos, &exc, &s,
1859 (PyObject **)&unicode, &outpos, &p))
1860 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001861 }
Walter Dörwald69652032004-09-07 20:24:22 +00001862 if (consumed)
1863 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001864
1865 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001866 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001867 goto onError;
1868
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001869 Py_XDECREF(errorHandler);
1870 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001871 return (PyObject *)unicode;
1872
1873onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001874 Py_XDECREF(errorHandler);
1875 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001876 Py_DECREF(unicode);
1877 return NULL;
1878}
1879
Tim Peters602f7402002-04-27 18:03:26 +00001880/* Allocation strategy: if the string is short, convert into a stack buffer
1881 and allocate exactly as much space needed at the end. Else allocate the
1882 maximum possible needed (4 result bytes per Unicode character), and return
1883 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001884*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001885PyObject *
1886PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001887 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001888 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001889{
Tim Peters602f7402002-04-27 18:03:26 +00001890#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001891
Martin v. Löwis18e16552006-02-15 17:27:45 +00001892 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001893 PyObject *v; /* result string object */
1894 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001895 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001896 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001897 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001898
Tim Peters602f7402002-04-27 18:03:26 +00001899 assert(s != NULL);
1900 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001901
Tim Peters602f7402002-04-27 18:03:26 +00001902 if (size <= MAX_SHORT_UNICHARS) {
1903 /* Write into the stack buffer; nallocated can't overflow.
1904 * At the end, we'll allocate exactly as much heap space as it
1905 * turns out we need.
1906 */
1907 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1908 v = NULL; /* will allocate after we're done */
1909 p = stackbuf;
1910 }
1911 else {
1912 /* Overallocate on the heap, and give the excess back at the end. */
1913 nallocated = size * 4;
1914 if (nallocated / 4 != size) /* overflow! */
1915 return PyErr_NoMemory();
1916 v = PyString_FromStringAndSize(NULL, nallocated);
1917 if (v == NULL)
1918 return NULL;
1919 p = PyString_AS_STRING(v);
1920 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001921
Tim Peters602f7402002-04-27 18:03:26 +00001922 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001923 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001924
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001925 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001926 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001927 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001928
Guido van Rossumd57fd912000-03-10 22:53:23 +00001929 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001930 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001931 *p++ = (char)(0xc0 | (ch >> 6));
1932 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001933 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001934 else {
Tim Peters602f7402002-04-27 18:03:26 +00001935 /* Encode UCS2 Unicode ordinals */
1936 if (ch < 0x10000) {
1937 /* Special case: check for high surrogate */
1938 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1939 Py_UCS4 ch2 = s[i];
1940 /* Check for low surrogate and combine the two to
1941 form a UCS4 value */
1942 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001943 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001944 i++;
1945 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001946 }
Tim Peters602f7402002-04-27 18:03:26 +00001947 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001948 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001949 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001950 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1951 *p++ = (char)(0x80 | (ch & 0x3f));
1952 continue;
1953 }
1954encodeUCS4:
1955 /* Encode UCS4 Unicode ordinals */
1956 *p++ = (char)(0xf0 | (ch >> 18));
1957 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1958 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1959 *p++ = (char)(0x80 | (ch & 0x3f));
1960 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001961 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001962
Tim Peters602f7402002-04-27 18:03:26 +00001963 if (v == NULL) {
1964 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001965 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001966 assert(nneeded <= nallocated);
1967 v = PyString_FromStringAndSize(stackbuf, nneeded);
1968 }
1969 else {
1970 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001971 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001972 assert(nneeded <= nallocated);
1973 _PyString_Resize(&v, nneeded);
1974 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001976
Tim Peters602f7402002-04-27 18:03:26 +00001977#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001978}
1979
Guido van Rossumd57fd912000-03-10 22:53:23 +00001980PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1981{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001982 if (!PyUnicode_Check(unicode)) {
1983 PyErr_BadArgument();
1984 return NULL;
1985 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001986 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1987 PyUnicode_GET_SIZE(unicode),
1988 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989}
1990
Walter Dörwald6e390802007-08-17 16:41:28 +00001991/* --- UTF-32 Codec ------------------------------------------------------- */
1992
1993PyObject *
1994PyUnicode_DecodeUTF32(const char *s,
1995 Py_ssize_t size,
1996 const char *errors,
1997 int *byteorder)
1998{
1999 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2000}
2001
2002PyObject *
2003PyUnicode_DecodeUTF32Stateful(const char *s,
2004 Py_ssize_t size,
2005 const char *errors,
2006 int *byteorder,
2007 Py_ssize_t *consumed)
2008{
2009 const char *starts = s;
2010 Py_ssize_t startinpos;
2011 Py_ssize_t endinpos;
2012 Py_ssize_t outpos;
2013 PyUnicodeObject *unicode;
2014 Py_UNICODE *p;
2015#ifndef Py_UNICODE_WIDE
2016 int i, pairs;
2017#else
2018 const int pairs = 0;
2019#endif
2020 const unsigned char *q, *e;
2021 int bo = 0; /* assume native ordering by default */
2022 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002023 /* Offsets from q for retrieving bytes in the right order. */
2024#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2025 int iorder[] = {0, 1, 2, 3};
2026#else
2027 int iorder[] = {3, 2, 1, 0};
2028#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002029 PyObject *errorHandler = NULL;
2030 PyObject *exc = NULL;
Walter Dörwald6e390802007-08-17 16:41:28 +00002031 /* On narrow builds we split characters outside the BMP into two
2032 codepoints => count how much extra space we need. */
2033#ifndef Py_UNICODE_WIDE
2034 for (i = pairs = 0; i < size/4; i++)
2035 if (((Py_UCS4 *)s)[i] >= 0x10000)
2036 pairs++;
2037#endif
Walter Dörwald6e390802007-08-17 16:41:28 +00002038
2039 /* This might be one to much, because of a BOM */
2040 unicode = _PyUnicode_New((size+3)/4+pairs);
2041 if (!unicode)
2042 return NULL;
2043 if (size == 0)
2044 return (PyObject *)unicode;
2045
2046 /* Unpack UTF-32 encoded data */
2047 p = unicode->str;
2048 q = (unsigned char *)s;
2049 e = q + size;
2050
2051 if (byteorder)
2052 bo = *byteorder;
2053
2054 /* Check for BOM marks (U+FEFF) in the input and adjust current
2055 byte order setting accordingly. In native mode, the leading BOM
2056 mark is skipped, in all other modes, it is copied to the output
2057 stream as-is (giving a ZWNBSP character). */
2058 if (bo == 0) {
2059 if (size >= 4) {
2060 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2061 (q[iorder[1]] << 8) | q[iorder[0]];
2062#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2063 if (bom == 0x0000FEFF) {
2064 q += 4;
2065 bo = -1;
2066 }
2067 else if (bom == 0xFFFE0000) {
2068 q += 4;
2069 bo = 1;
2070 }
2071#else
2072 if (bom == 0x0000FEFF) {
2073 q += 4;
2074 bo = 1;
2075 }
2076 else if (bom == 0xFFFE0000) {
2077 q += 4;
2078 bo = -1;
2079 }
2080#endif
2081 }
2082 }
2083
2084 if (bo == -1) {
2085 /* force LE */
2086 iorder[0] = 0;
2087 iorder[1] = 1;
2088 iorder[2] = 2;
2089 iorder[3] = 3;
2090 }
2091 else if (bo == 1) {
2092 /* force BE */
2093 iorder[0] = 3;
2094 iorder[1] = 2;
2095 iorder[2] = 1;
2096 iorder[3] = 0;
2097 }
2098
2099 while (q < e) {
2100 Py_UCS4 ch;
2101 /* remaining bytes at the end? (size should be divisible by 4) */
2102 if (e-q<4) {
2103 if (consumed)
2104 break;
2105 errmsg = "truncated data";
2106 startinpos = ((const char *)q)-starts;
2107 endinpos = ((const char *)e)-starts;
2108 goto utf32Error;
2109 /* The remaining input chars are ignored if the callback
2110 chooses to skip the input */
2111 }
2112 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2113 (q[iorder[1]] << 8) | q[iorder[0]];
2114
2115 if (ch >= 0x110000)
2116 {
2117 errmsg = "codepoint not in range(0x110000)";
2118 startinpos = ((const char *)q)-starts;
2119 endinpos = startinpos+4;
2120 goto utf32Error;
2121 }
2122#ifndef Py_UNICODE_WIDE
2123 if (ch >= 0x10000)
2124 {
2125 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2126 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2127 }
2128 else
2129#endif
2130 *p++ = ch;
2131 q += 4;
2132 continue;
2133 utf32Error:
2134 outpos = p-PyUnicode_AS_UNICODE(unicode);
2135 if (unicode_decode_call_errorhandler(
2136 errors, &errorHandler,
2137 "utf32", errmsg,
2138 starts, size, &startinpos, &endinpos, &exc, &s,
2139 (PyObject **)&unicode, &outpos, &p))
2140 goto onError;
2141 }
2142
2143 if (byteorder)
2144 *byteorder = bo;
2145
2146 if (consumed)
2147 *consumed = (const char *)q-starts;
2148
2149 /* Adjust length */
2150 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2151 goto onError;
2152
2153 Py_XDECREF(errorHandler);
2154 Py_XDECREF(exc);
2155 return (PyObject *)unicode;
2156
2157onError:
2158 Py_DECREF(unicode);
2159 Py_XDECREF(errorHandler);
2160 Py_XDECREF(exc);
2161 return NULL;
2162}
2163
2164PyObject *
2165PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2166 Py_ssize_t size,
2167 const char *errors,
2168 int byteorder)
2169{
2170 PyObject *v;
2171 unsigned char *p;
2172#ifndef Py_UNICODE_WIDE
2173 int i, pairs;
2174#else
2175 const int pairs = 0;
2176#endif
2177 /* Offsets from p for storing byte pairs in the right order. */
2178#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2179 int iorder[] = {0, 1, 2, 3};
2180#else
2181 int iorder[] = {3, 2, 1, 0};
2182#endif
2183
2184#define STORECHAR(CH) \
2185 do { \
2186 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2187 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2188 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2189 p[iorder[0]] = (CH) & 0xff; \
2190 p += 4; \
2191 } while(0)
2192
2193 /* In narrow builds we can output surrogate pairs as one codepoint,
2194 so we need less space. */
2195#ifndef Py_UNICODE_WIDE
2196 for (i = pairs = 0; i < size-1; i++)
2197 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2198 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2199 pairs++;
2200#endif
2201 v = PyString_FromStringAndSize(NULL,
2202 4 * (size - pairs + (byteorder == 0)));
2203 if (v == NULL)
2204 return NULL;
2205
2206 p = (unsigned char *)PyString_AS_STRING(v);
2207 if (byteorder == 0)
2208 STORECHAR(0xFEFF);
2209 if (size == 0)
2210 return v;
2211
2212 if (byteorder == -1) {
2213 /* force LE */
2214 iorder[0] = 0;
2215 iorder[1] = 1;
2216 iorder[2] = 2;
2217 iorder[3] = 3;
2218 }
2219 else if (byteorder == 1) {
2220 /* force BE */
2221 iorder[0] = 3;
2222 iorder[1] = 2;
2223 iorder[2] = 1;
2224 iorder[3] = 0;
2225 }
2226
2227 while (size-- > 0) {
2228 Py_UCS4 ch = *s++;
2229#ifndef Py_UNICODE_WIDE
2230 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2231 Py_UCS4 ch2 = *s;
2232 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2233 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2234 s++;
2235 size--;
2236 }
2237 }
2238#endif
2239 STORECHAR(ch);
2240 }
2241 return v;
2242#undef STORECHAR
2243}
2244
2245PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2246{
2247 if (!PyUnicode_Check(unicode)) {
2248 PyErr_BadArgument();
2249 return NULL;
2250 }
2251 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2252 PyUnicode_GET_SIZE(unicode),
2253 NULL,
2254 0);
2255}
2256
Guido van Rossumd57fd912000-03-10 22:53:23 +00002257/* --- UTF-16 Codec ------------------------------------------------------- */
2258
Tim Peters772747b2001-08-09 22:21:55 +00002259PyObject *
2260PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002261 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002262 const char *errors,
2263 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002264{
Walter Dörwald69652032004-09-07 20:24:22 +00002265 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2266}
2267
2268PyObject *
2269PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002270 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002271 const char *errors,
2272 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002273 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002274{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002275 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002276 Py_ssize_t startinpos;
2277 Py_ssize_t endinpos;
2278 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279 PyUnicodeObject *unicode;
2280 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002281 const unsigned char *q, *e;
2282 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002283 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002284 /* Offsets from q for retrieving byte pairs in the right order. */
2285#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2286 int ihi = 1, ilo = 0;
2287#else
2288 int ihi = 0, ilo = 1;
2289#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002290 PyObject *errorHandler = NULL;
2291 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002292
2293 /* Note: size will always be longer than the resulting Unicode
2294 character count */
2295 unicode = _PyUnicode_New(size);
2296 if (!unicode)
2297 return NULL;
2298 if (size == 0)
2299 return (PyObject *)unicode;
2300
2301 /* Unpack UTF-16 encoded data */
2302 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002303 q = (unsigned char *)s;
2304 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002305
2306 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002307 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002308
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002309 /* Check for BOM marks (U+FEFF) in the input and adjust current
2310 byte order setting accordingly. In native mode, the leading BOM
2311 mark is skipped, in all other modes, it is copied to the output
2312 stream as-is (giving a ZWNBSP character). */
2313 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002314 if (size >= 2) {
2315 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002316#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002317 if (bom == 0xFEFF) {
2318 q += 2;
2319 bo = -1;
2320 }
2321 else if (bom == 0xFFFE) {
2322 q += 2;
2323 bo = 1;
2324 }
Tim Petersced69f82003-09-16 20:30:58 +00002325#else
Walter Dörwald69652032004-09-07 20:24:22 +00002326 if (bom == 0xFEFF) {
2327 q += 2;
2328 bo = 1;
2329 }
2330 else if (bom == 0xFFFE) {
2331 q += 2;
2332 bo = -1;
2333 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002334#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002335 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002336 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002337
Tim Peters772747b2001-08-09 22:21:55 +00002338 if (bo == -1) {
2339 /* force LE */
2340 ihi = 1;
2341 ilo = 0;
2342 }
2343 else if (bo == 1) {
2344 /* force BE */
2345 ihi = 0;
2346 ilo = 1;
2347 }
2348
2349 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002350 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002351 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002352 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002353 if (consumed)
2354 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002355 errmsg = "truncated data";
2356 startinpos = ((const char *)q)-starts;
2357 endinpos = ((const char *)e)-starts;
2358 goto utf16Error;
2359 /* The remaining input chars are ignored if the callback
2360 chooses to skip the input */
2361 }
2362 ch = (q[ihi] << 8) | q[ilo];
2363
Tim Peters772747b2001-08-09 22:21:55 +00002364 q += 2;
2365
Guido van Rossumd57fd912000-03-10 22:53:23 +00002366 if (ch < 0xD800 || ch > 0xDFFF) {
2367 *p++ = ch;
2368 continue;
2369 }
2370
2371 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002372 if (q >= e) {
2373 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002374 startinpos = (((const char *)q)-2)-starts;
2375 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002376 goto utf16Error;
2377 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002378 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002379 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2380 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002381 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002382#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002383 *p++ = ch;
2384 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002385#else
2386 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002387#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002388 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002389 }
2390 else {
2391 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002392 startinpos = (((const char *)q)-4)-starts;
2393 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002394 goto utf16Error;
2395 }
2396
Guido van Rossumd57fd912000-03-10 22:53:23 +00002397 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002398 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002399 startinpos = (((const char *)q)-2)-starts;
2400 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002401 /* Fall through to report the error */
2402
2403 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002404 outpos = p-PyUnicode_AS_UNICODE(unicode);
2405 if (unicode_decode_call_errorhandler(
2406 errors, &errorHandler,
2407 "utf16", errmsg,
2408 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2409 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002410 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002411 }
2412
2413 if (byteorder)
2414 *byteorder = bo;
2415
Walter Dörwald69652032004-09-07 20:24:22 +00002416 if (consumed)
2417 *consumed = (const char *)q-starts;
2418
Guido van Rossumd57fd912000-03-10 22:53:23 +00002419 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002420 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002421 goto onError;
2422
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002423 Py_XDECREF(errorHandler);
2424 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002425 return (PyObject *)unicode;
2426
2427onError:
2428 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002429 Py_XDECREF(errorHandler);
2430 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002431 return NULL;
2432}
2433
Tim Peters772747b2001-08-09 22:21:55 +00002434PyObject *
2435PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002436 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002437 const char *errors,
2438 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002439{
2440 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002441 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002442#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002443 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002444#else
2445 const int pairs = 0;
2446#endif
Tim Peters772747b2001-08-09 22:21:55 +00002447 /* Offsets from p for storing byte pairs in the right order. */
2448#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2449 int ihi = 1, ilo = 0;
2450#else
2451 int ihi = 0, ilo = 1;
2452#endif
2453
2454#define STORECHAR(CH) \
2455 do { \
2456 p[ihi] = ((CH) >> 8) & 0xff; \
2457 p[ilo] = (CH) & 0xff; \
2458 p += 2; \
2459 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002460
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002461#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002462 for (i = pairs = 0; i < size; i++)
2463 if (s[i] >= 0x10000)
2464 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002465#endif
Tim Petersced69f82003-09-16 20:30:58 +00002466 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002467 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002468 if (v == NULL)
2469 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002470
Tim Peters772747b2001-08-09 22:21:55 +00002471 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002472 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002473 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002474 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002475 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002476
2477 if (byteorder == -1) {
2478 /* force LE */
2479 ihi = 1;
2480 ilo = 0;
2481 }
2482 else if (byteorder == 1) {
2483 /* force BE */
2484 ihi = 0;
2485 ilo = 1;
2486 }
2487
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002488 while (size-- > 0) {
2489 Py_UNICODE ch = *s++;
2490 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002491#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002492 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002493 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2494 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002495 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002496#endif
Tim Peters772747b2001-08-09 22:21:55 +00002497 STORECHAR(ch);
2498 if (ch2)
2499 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002500 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002501 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002502#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002503}
2504
2505PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2506{
2507 if (!PyUnicode_Check(unicode)) {
2508 PyErr_BadArgument();
2509 return NULL;
2510 }
2511 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2512 PyUnicode_GET_SIZE(unicode),
2513 NULL,
2514 0);
2515}
2516
2517/* --- Unicode Escape Codec ----------------------------------------------- */
2518
Fredrik Lundh06d12682001-01-24 07:59:11 +00002519static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002520
Guido van Rossumd57fd912000-03-10 22:53:23 +00002521PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002522 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002523 const char *errors)
2524{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002525 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002526 Py_ssize_t startinpos;
2527 Py_ssize_t endinpos;
2528 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002529 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002530 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002531 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002532 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002533 char* message;
2534 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002535 PyObject *errorHandler = NULL;
2536 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002537
Guido van Rossumd57fd912000-03-10 22:53:23 +00002538 /* Escaped strings will always be longer than the resulting
2539 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002540 length after conversion to the true value.
2541 (but if the error callback returns a long replacement string
2542 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002543 v = _PyUnicode_New(size);
2544 if (v == NULL)
2545 goto onError;
2546 if (size == 0)
2547 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002548
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002549 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002550 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002551
Guido van Rossumd57fd912000-03-10 22:53:23 +00002552 while (s < end) {
2553 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002554 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002555 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002556
2557 /* Non-escape characters are interpreted as Unicode ordinals */
2558 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002559 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002560 continue;
2561 }
2562
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002563 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002564 /* \ - Escapes */
2565 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002566 c = *s++;
2567 if (s > end)
2568 c = '\0'; /* Invalid after \ */
2569 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002570
2571 /* \x escapes */
2572 case '\n': break;
2573 case '\\': *p++ = '\\'; break;
2574 case '\'': *p++ = '\''; break;
2575 case '\"': *p++ = '\"'; break;
2576 case 'b': *p++ = '\b'; break;
2577 case 'f': *p++ = '\014'; break; /* FF */
2578 case 't': *p++ = '\t'; break;
2579 case 'n': *p++ = '\n'; break;
2580 case 'r': *p++ = '\r'; break;
2581 case 'v': *p++ = '\013'; break; /* VT */
2582 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2583
2584 /* \OOO (octal) escapes */
2585 case '0': case '1': case '2': case '3':
2586 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002587 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002588 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002589 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002590 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002591 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002592 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002593 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002594 break;
2595
Fredrik Lundhccc74732001-02-18 22:13:49 +00002596 /* hex escapes */
2597 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002598 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002599 digits = 2;
2600 message = "truncated \\xXX escape";
2601 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002602
Fredrik Lundhccc74732001-02-18 22:13:49 +00002603 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002605 digits = 4;
2606 message = "truncated \\uXXXX escape";
2607 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002608
Fredrik Lundhccc74732001-02-18 22:13:49 +00002609 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002610 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002611 digits = 8;
2612 message = "truncated \\UXXXXXXXX escape";
2613 hexescape:
2614 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002615 outpos = p-PyUnicode_AS_UNICODE(v);
2616 if (s+digits>end) {
2617 endinpos = size;
2618 if (unicode_decode_call_errorhandler(
2619 errors, &errorHandler,
2620 "unicodeescape", "end of string in escape sequence",
2621 starts, size, &startinpos, &endinpos, &exc, &s,
2622 (PyObject **)&v, &outpos, &p))
2623 goto onError;
2624 goto nextByte;
2625 }
2626 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002627 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002628 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002629 endinpos = (s+i+1)-starts;
2630 if (unicode_decode_call_errorhandler(
2631 errors, &errorHandler,
2632 "unicodeescape", message,
2633 starts, size, &startinpos, &endinpos, &exc, &s,
2634 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002635 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002636 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002637 }
2638 chr = (chr<<4) & ~0xF;
2639 if (c >= '0' && c <= '9')
2640 chr += c - '0';
2641 else if (c >= 'a' && c <= 'f')
2642 chr += 10 + c - 'a';
2643 else
2644 chr += 10 + c - 'A';
2645 }
2646 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002647 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002648 /* _decoding_error will have already written into the
2649 target buffer. */
2650 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002651 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002652 /* when we get here, chr is a 32-bit unicode character */
2653 if (chr <= 0xffff)
2654 /* UCS-2 character */
2655 *p++ = (Py_UNICODE) chr;
2656 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002657 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002658 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002659#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002660 *p++ = chr;
2661#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002662 chr -= 0x10000L;
2663 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002664 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002665#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002666 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002667 endinpos = s-starts;
2668 outpos = p-PyUnicode_AS_UNICODE(v);
2669 if (unicode_decode_call_errorhandler(
2670 errors, &errorHandler,
2671 "unicodeescape", "illegal Unicode character",
2672 starts, size, &startinpos, &endinpos, &exc, &s,
2673 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002674 goto onError;
2675 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002676 break;
2677
2678 /* \N{name} */
2679 case 'N':
2680 message = "malformed \\N character escape";
2681 if (ucnhash_CAPI == NULL) {
2682 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002683 PyObject *m, *api;
Christian Heimes000a0742008-01-03 22:16:32 +00002684 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002685 if (m == NULL)
2686 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002687 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002688 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002689 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002690 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00002691 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002692 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002693 if (ucnhash_CAPI == NULL)
2694 goto ucnhashError;
2695 }
2696 if (*s == '{') {
2697 const char *start = s+1;
2698 /* look for the closing brace */
2699 while (*s != '}' && s < end)
2700 s++;
2701 if (s > start && s < end && *s == '}') {
2702 /* found a name. look it up in the unicode database */
2703 message = "unknown Unicode character name";
2704 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002705 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002706 goto store;
2707 }
2708 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002709 endinpos = s-starts;
2710 outpos = p-PyUnicode_AS_UNICODE(v);
2711 if (unicode_decode_call_errorhandler(
2712 errors, &errorHandler,
2713 "unicodeescape", message,
2714 starts, size, &startinpos, &endinpos, &exc, &s,
2715 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002716 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002717 break;
2718
2719 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002720 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002721 message = "\\ at end of string";
2722 s--;
2723 endinpos = s-starts;
2724 outpos = p-PyUnicode_AS_UNICODE(v);
2725 if (unicode_decode_call_errorhandler(
2726 errors, &errorHandler,
2727 "unicodeescape", message,
2728 starts, size, &startinpos, &endinpos, &exc, &s,
2729 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002730 goto onError;
2731 }
2732 else {
2733 *p++ = '\\';
2734 *p++ = (unsigned char)s[-1];
2735 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002736 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002737 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002738 nextByte:
2739 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002740 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002741 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002742 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002743 Py_XDECREF(errorHandler);
2744 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002745 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002746
Fredrik Lundhccc74732001-02-18 22:13:49 +00002747ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002748 PyErr_SetString(
2749 PyExc_UnicodeError,
2750 "\\N escapes not supported (can't load unicodedata module)"
2751 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002752 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002753 Py_XDECREF(errorHandler);
2754 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002755 return NULL;
2756
Fredrik Lundhccc74732001-02-18 22:13:49 +00002757onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002759 Py_XDECREF(errorHandler);
2760 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761 return NULL;
2762}
2763
2764/* Return a Unicode-Escape string version of the Unicode object.
2765
2766 If quotes is true, the string is enclosed in u"" or u'' quotes as
2767 appropriate.
2768
2769*/
2770
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002771Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Fredrik Lundh95e2a912006-05-26 11:38:15 +00002772 Py_ssize_t size,
2773 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002774{
2775 /* like wcschr, but doesn't stop at NULL characters */
2776
2777 while (size-- > 0) {
2778 if (*s == ch)
2779 return s;
2780 s++;
2781 }
2782
2783 return NULL;
2784}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002785
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786static
2787PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002788 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002789 int quotes)
2790{
2791 PyObject *repr;
2792 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002793
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002794 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002795
Neal Norwitz17753ec2006-08-21 22:21:19 +00002796 /* XXX(nnorwitz): rather than over-allocating, it would be
2797 better to choose a different scheme. Perhaps scan the
2798 first N-chars of the string and allocate based on that size.
2799 */
2800 /* Initial allocation is based on the longest-possible unichr
2801 escape.
2802
2803 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2804 unichr, so in this case it's the longest unichr escape. In
2805 narrow (UTF-16) builds this is five chars per source unichr
2806 since there are two unichrs in the surrogate pair, so in narrow
2807 (UTF-16) builds it's not the longest unichr escape.
2808
2809 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2810 so in the narrow (UTF-16) build case it's the longest unichr
2811 escape.
2812 */
2813
2814 repr = PyString_FromStringAndSize(NULL,
2815 2
2816#ifdef Py_UNICODE_WIDE
2817 + 10*size
2818#else
2819 + 6*size
2820#endif
2821 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002822 if (repr == NULL)
2823 return NULL;
2824
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002825 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826
2827 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002828 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002829 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002830 !findchar(s, size, '"')) ? '"' : '\'';
2831 }
2832 while (size-- > 0) {
2833 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002834
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002835 /* Escape quotes and backslashes */
2836 if ((quotes &&
2837 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002838 *p++ = '\\';
2839 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002840 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002841 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002842
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002843#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002844 /* Map 21-bit characters to '\U00xxxxxx' */
2845 else if (ch >= 0x10000) {
2846 *p++ = '\\';
2847 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002848 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2849 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2850 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2851 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2852 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2853 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2854 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002855 *p++ = hexdigit[ch & 0x0000000F];
2856 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002857 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002858#else
2859 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002860 else if (ch >= 0xD800 && ch < 0xDC00) {
2861 Py_UNICODE ch2;
2862 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002863
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002864 ch2 = *s++;
2865 size--;
2866 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2867 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2868 *p++ = '\\';
2869 *p++ = 'U';
2870 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2871 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2872 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2873 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2874 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2875 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2876 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2877 *p++ = hexdigit[ucs & 0x0000000F];
2878 continue;
2879 }
2880 /* Fall through: isolated surrogates are copied as-is */
2881 s--;
2882 size++;
2883 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002884#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002885
Guido van Rossumd57fd912000-03-10 22:53:23 +00002886 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002887 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002888 *p++ = '\\';
2889 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002890 *p++ = hexdigit[(ch >> 12) & 0x000F];
2891 *p++ = hexdigit[(ch >> 8) & 0x000F];
2892 *p++ = hexdigit[(ch >> 4) & 0x000F];
2893 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002894 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002895
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002896 /* Map special whitespace to '\t', \n', '\r' */
2897 else if (ch == '\t') {
2898 *p++ = '\\';
2899 *p++ = 't';
2900 }
2901 else if (ch == '\n') {
2902 *p++ = '\\';
2903 *p++ = 'n';
2904 }
2905 else if (ch == '\r') {
2906 *p++ = '\\';
2907 *p++ = 'r';
2908 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002909
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002910 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002911 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002912 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002913 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002914 *p++ = hexdigit[(ch >> 4) & 0x000F];
2915 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002916 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002917
Guido van Rossumd57fd912000-03-10 22:53:23 +00002918 /* Copy everything else as-is */
2919 else
2920 *p++ = (char) ch;
2921 }
2922 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002923 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002924
2925 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002926 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002927 return repr;
2928}
2929
2930PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002931 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002932{
2933 return unicodeescape_string(s, size, 0);
2934}
2935
2936PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2937{
2938 if (!PyUnicode_Check(unicode)) {
2939 PyErr_BadArgument();
2940 return NULL;
2941 }
2942 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2943 PyUnicode_GET_SIZE(unicode));
2944}
2945
2946/* --- Raw Unicode Escape Codec ------------------------------------------- */
2947
2948PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002949 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002950 const char *errors)
2951{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002952 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002953 Py_ssize_t startinpos;
2954 Py_ssize_t endinpos;
2955 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002956 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002957 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002958 const char *end;
2959 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002960 PyObject *errorHandler = NULL;
2961 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002962
Guido van Rossumd57fd912000-03-10 22:53:23 +00002963 /* Escaped strings will always be longer than the resulting
2964 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002965 length after conversion to the true value. (But decoding error
2966 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002967 v = _PyUnicode_New(size);
2968 if (v == NULL)
2969 goto onError;
2970 if (size == 0)
2971 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002972 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002973 end = s + size;
2974 while (s < end) {
2975 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002976 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002977 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002978 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979
2980 /* Non-escape characters are interpreted as Unicode ordinals */
2981 if (*s != '\\') {
2982 *p++ = (unsigned char)*s++;
2983 continue;
2984 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002985 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002986
2987 /* \u-escapes are only interpreted iff the number of leading
2988 backslashes if odd */
2989 bs = s;
2990 for (;s < end;) {
2991 if (*s != '\\')
2992 break;
2993 *p++ = (unsigned char)*s++;
2994 }
2995 if (((s - bs) & 1) == 0 ||
2996 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002997 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998 continue;
2999 }
3000 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003001 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003002 s++;
3003
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003004 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003005 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003006 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003007 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003008 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003009 endinpos = s-starts;
3010 if (unicode_decode_call_errorhandler(
3011 errors, &errorHandler,
3012 "rawunicodeescape", "truncated \\uXXXX",
3013 starts, size, &startinpos, &endinpos, &exc, &s,
3014 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003015 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003016 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003017 }
3018 x = (x<<4) & ~0xF;
3019 if (c >= '0' && c <= '9')
3020 x += c - '0';
3021 else if (c >= 'a' && c <= 'f')
3022 x += 10 + c - 'a';
3023 else
3024 x += 10 + c - 'A';
3025 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003026#ifndef Py_UNICODE_WIDE
3027 if (x > 0x10000) {
3028 if (unicode_decode_call_errorhandler(
3029 errors, &errorHandler,
3030 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3031 starts, size, &startinpos, &endinpos, &exc, &s,
3032 (PyObject **)&v, &outpos, &p))
3033 goto onError;
3034 }
3035#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003036 *p++ = x;
3037 nextByte:
3038 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003040 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003041 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003042 Py_XDECREF(errorHandler);
3043 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003044 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003045
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046 onError:
3047 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003048 Py_XDECREF(errorHandler);
3049 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050 return NULL;
3051}
3052
3053PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003054 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003055{
3056 PyObject *repr;
3057 char *p;
3058 char *q;
3059
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003060 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003061
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003062#ifdef Py_UNICODE_WIDE
3063 repr = PyString_FromStringAndSize(NULL, 10 * size);
3064#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003065 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003066#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067 if (repr == NULL)
3068 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003069 if (size == 0)
3070 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003071
3072 p = q = PyString_AS_STRING(repr);
3073 while (size-- > 0) {
3074 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003075#ifdef Py_UNICODE_WIDE
3076 /* Map 32-bit characters to '\Uxxxxxxxx' */
3077 if (ch >= 0x10000) {
3078 *p++ = '\\';
3079 *p++ = 'U';
3080 *p++ = hexdigit[(ch >> 28) & 0xf];
3081 *p++ = hexdigit[(ch >> 24) & 0xf];
3082 *p++ = hexdigit[(ch >> 20) & 0xf];
3083 *p++ = hexdigit[(ch >> 16) & 0xf];
3084 *p++ = hexdigit[(ch >> 12) & 0xf];
3085 *p++ = hexdigit[(ch >> 8) & 0xf];
3086 *p++ = hexdigit[(ch >> 4) & 0xf];
3087 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003088 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003089 else
3090#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091 /* Map 16-bit characters to '\uxxxx' */
3092 if (ch >= 256) {
3093 *p++ = '\\';
3094 *p++ = 'u';
3095 *p++ = hexdigit[(ch >> 12) & 0xf];
3096 *p++ = hexdigit[(ch >> 8) & 0xf];
3097 *p++ = hexdigit[(ch >> 4) & 0xf];
3098 *p++ = hexdigit[ch & 15];
3099 }
3100 /* Copy everything else as-is */
3101 else
3102 *p++ = (char) ch;
3103 }
3104 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00003105 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003106 return repr;
3107}
3108
3109PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3110{
3111 if (!PyUnicode_Check(unicode)) {
3112 PyErr_BadArgument();
3113 return NULL;
3114 }
3115 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3116 PyUnicode_GET_SIZE(unicode));
3117}
3118
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003119/* --- Unicode Internal Codec ------------------------------------------- */
3120
3121PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003122 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003123 const char *errors)
3124{
3125 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003126 Py_ssize_t startinpos;
3127 Py_ssize_t endinpos;
3128 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003129 PyUnicodeObject *v;
3130 Py_UNICODE *p;
3131 const char *end;
3132 const char *reason;
3133 PyObject *errorHandler = NULL;
3134 PyObject *exc = NULL;
3135
Neal Norwitzd43069c2006-01-08 01:12:10 +00003136#ifdef Py_UNICODE_WIDE
3137 Py_UNICODE unimax = PyUnicode_GetMax();
3138#endif
3139
Armin Rigo7ccbca92006-10-04 12:17:45 +00003140 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003141 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3142 if (v == NULL)
3143 goto onError;
3144 if (PyUnicode_GetSize((PyObject *)v) == 0)
3145 return (PyObject *)v;
3146 p = PyUnicode_AS_UNICODE(v);
3147 end = s + size;
3148
3149 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003150 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003151 /* We have to sanity check the raw data, otherwise doom looms for
3152 some malformed UCS-4 data. */
3153 if (
3154 #ifdef Py_UNICODE_WIDE
3155 *p > unimax || *p < 0 ||
3156 #endif
3157 end-s < Py_UNICODE_SIZE
3158 )
3159 {
3160 startinpos = s - starts;
3161 if (end-s < Py_UNICODE_SIZE) {
3162 endinpos = end-starts;
3163 reason = "truncated input";
3164 }
3165 else {
3166 endinpos = s - starts + Py_UNICODE_SIZE;
3167 reason = "illegal code point (> 0x10FFFF)";
3168 }
3169 outpos = p - PyUnicode_AS_UNICODE(v);
3170 if (unicode_decode_call_errorhandler(
3171 errors, &errorHandler,
3172 "unicode_internal", reason,
3173 starts, size, &startinpos, &endinpos, &exc, &s,
3174 (PyObject **)&v, &outpos, &p)) {
3175 goto onError;
3176 }
3177 }
3178 else {
3179 p++;
3180 s += Py_UNICODE_SIZE;
3181 }
3182 }
3183
Martin v. Löwis412fb672006-04-13 06:34:32 +00003184 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003185 goto onError;
3186 Py_XDECREF(errorHandler);
3187 Py_XDECREF(exc);
3188 return (PyObject *)v;
3189
3190 onError:
3191 Py_XDECREF(v);
3192 Py_XDECREF(errorHandler);
3193 Py_XDECREF(exc);
3194 return NULL;
3195}
3196
Guido van Rossumd57fd912000-03-10 22:53:23 +00003197/* --- Latin-1 Codec ------------------------------------------------------ */
3198
3199PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003200 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003201 const char *errors)
3202{
3203 PyUnicodeObject *v;
3204 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003205
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003207 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003208 Py_UNICODE r = *(unsigned char*)s;
3209 return PyUnicode_FromUnicode(&r, 1);
3210 }
3211
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212 v = _PyUnicode_New(size);
3213 if (v == NULL)
3214 goto onError;
3215 if (size == 0)
3216 return (PyObject *)v;
3217 p = PyUnicode_AS_UNICODE(v);
3218 while (size-- > 0)
3219 *p++ = (unsigned char)*s++;
3220 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003221
Guido van Rossumd57fd912000-03-10 22:53:23 +00003222 onError:
3223 Py_XDECREF(v);
3224 return NULL;
3225}
3226
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003227/* create or adjust a UnicodeEncodeError */
3228static void make_encode_exception(PyObject **exceptionObject,
3229 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003230 const Py_UNICODE *unicode, Py_ssize_t size,
3231 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003232 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003233{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003234 if (*exceptionObject == NULL) {
3235 *exceptionObject = PyUnicodeEncodeError_Create(
3236 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237 }
3238 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003239 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3240 goto onError;
3241 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3242 goto onError;
3243 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3244 goto onError;
3245 return;
3246 onError:
3247 Py_DECREF(*exceptionObject);
3248 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003249 }
3250}
3251
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003252/* raises a UnicodeEncodeError */
3253static void raise_encode_exception(PyObject **exceptionObject,
3254 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003255 const Py_UNICODE *unicode, Py_ssize_t size,
3256 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003257 const char *reason)
3258{
3259 make_encode_exception(exceptionObject,
3260 encoding, unicode, size, startpos, endpos, reason);
3261 if (*exceptionObject != NULL)
3262 PyCodec_StrictErrors(*exceptionObject);
3263}
3264
3265/* error handling callback helper:
3266 build arguments, call the callback and check the arguments,
3267 put the result into newpos and return the replacement string, which
3268 has to be freed by the caller */
3269static PyObject *unicode_encode_call_errorhandler(const char *errors,
3270 PyObject **errorHandler,
3271 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003272 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3273 Py_ssize_t startpos, Py_ssize_t endpos,
3274 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003275{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003276 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003277
3278 PyObject *restuple;
3279 PyObject *resunicode;
3280
3281 if (*errorHandler == NULL) {
3282 *errorHandler = PyCodec_LookupError(errors);
3283 if (*errorHandler == NULL)
3284 return NULL;
3285 }
3286
3287 make_encode_exception(exceptionObject,
3288 encoding, unicode, size, startpos, endpos, reason);
3289 if (*exceptionObject == NULL)
3290 return NULL;
3291
3292 restuple = PyObject_CallFunctionObjArgs(
3293 *errorHandler, *exceptionObject, NULL);
3294 if (restuple == NULL)
3295 return NULL;
3296 if (!PyTuple_Check(restuple)) {
3297 PyErr_Format(PyExc_TypeError, &argparse[4]);
3298 Py_DECREF(restuple);
3299 return NULL;
3300 }
3301 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3302 &resunicode, newpos)) {
3303 Py_DECREF(restuple);
3304 return NULL;
3305 }
3306 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003307 *newpos = size+*newpos;
3308 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003309 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003310 Py_DECREF(restuple);
3311 return NULL;
3312 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003313 Py_INCREF(resunicode);
3314 Py_DECREF(restuple);
3315 return resunicode;
3316}
3317
3318static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003319 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003320 const char *errors,
3321 int limit)
3322{
3323 /* output object */
3324 PyObject *res;
3325 /* pointers to the beginning and end+1 of input */
3326 const Py_UNICODE *startp = p;
3327 const Py_UNICODE *endp = p + size;
3328 /* pointer to the beginning of the unencodable characters */
3329 /* const Py_UNICODE *badp = NULL; */
3330 /* pointer into the output */
3331 char *str;
3332 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003333 Py_ssize_t respos = 0;
3334 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003335 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3336 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003337 PyObject *errorHandler = NULL;
3338 PyObject *exc = NULL;
3339 /* the following variable is used for caching string comparisons
3340 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3341 int known_errorHandler = -1;
3342
3343 /* allocate enough for a simple encoding without
3344 replacements, if we need more, we'll resize */
3345 res = PyString_FromStringAndSize(NULL, size);
3346 if (res == NULL)
3347 goto onError;
3348 if (size == 0)
3349 return res;
3350 str = PyString_AS_STRING(res);
3351 ressize = size;
3352
3353 while (p<endp) {
3354 Py_UNICODE c = *p;
3355
3356 /* can we encode this? */
3357 if (c<limit) {
3358 /* no overflow check, because we know that the space is enough */
3359 *str++ = (char)c;
3360 ++p;
3361 }
3362 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003363 Py_ssize_t unicodepos = p-startp;
3364 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003365 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003366 Py_ssize_t repsize;
3367 Py_ssize_t newpos;
3368 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003369 Py_UNICODE *uni2;
3370 /* startpos for collecting unencodable chars */
3371 const Py_UNICODE *collstart = p;
3372 const Py_UNICODE *collend = p;
3373 /* find all unecodable characters */
3374 while ((collend < endp) && ((*collend)>=limit))
3375 ++collend;
3376 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3377 if (known_errorHandler==-1) {
3378 if ((errors==NULL) || (!strcmp(errors, "strict")))
3379 known_errorHandler = 1;
3380 else if (!strcmp(errors, "replace"))
3381 known_errorHandler = 2;
3382 else if (!strcmp(errors, "ignore"))
3383 known_errorHandler = 3;
3384 else if (!strcmp(errors, "xmlcharrefreplace"))
3385 known_errorHandler = 4;
3386 else
3387 known_errorHandler = 0;
3388 }
3389 switch (known_errorHandler) {
3390 case 1: /* strict */
3391 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3392 goto onError;
3393 case 2: /* replace */
3394 while (collstart++<collend)
3395 *str++ = '?'; /* fall through */
3396 case 3: /* ignore */
3397 p = collend;
3398 break;
3399 case 4: /* xmlcharrefreplace */
3400 respos = str-PyString_AS_STRING(res);
3401 /* determine replacement size (temporarily (mis)uses p) */
3402 for (p = collstart, repsize = 0; p < collend; ++p) {
3403 if (*p<10)
3404 repsize += 2+1+1;
3405 else if (*p<100)
3406 repsize += 2+2+1;
3407 else if (*p<1000)
3408 repsize += 2+3+1;
3409 else if (*p<10000)
3410 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003411#ifndef Py_UNICODE_WIDE
3412 else
3413 repsize += 2+5+1;
3414#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003415 else if (*p<100000)
3416 repsize += 2+5+1;
3417 else if (*p<1000000)
3418 repsize += 2+6+1;
3419 else
3420 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003421#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003422 }
3423 requiredsize = respos+repsize+(endp-collend);
3424 if (requiredsize > ressize) {
3425 if (requiredsize<2*ressize)
3426 requiredsize = 2*ressize;
3427 if (_PyString_Resize(&res, requiredsize))
3428 goto onError;
3429 str = PyString_AS_STRING(res) + respos;
3430 ressize = requiredsize;
3431 }
3432 /* generate replacement (temporarily (mis)uses p) */
3433 for (p = collstart; p < collend; ++p) {
3434 str += sprintf(str, "&#%d;", (int)*p);
3435 }
3436 p = collend;
3437 break;
3438 default:
3439 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3440 encoding, reason, startp, size, &exc,
3441 collstart-startp, collend-startp, &newpos);
3442 if (repunicode == NULL)
3443 goto onError;
3444 /* need more space? (at least enough for what we
3445 have+the replacement+the rest of the string, so
3446 we won't have to check space for encodable characters) */
3447 respos = str-PyString_AS_STRING(res);
3448 repsize = PyUnicode_GET_SIZE(repunicode);
3449 requiredsize = respos+repsize+(endp-collend);
3450 if (requiredsize > ressize) {
3451 if (requiredsize<2*ressize)
3452 requiredsize = 2*ressize;
3453 if (_PyString_Resize(&res, requiredsize)) {
3454 Py_DECREF(repunicode);
3455 goto onError;
3456 }
3457 str = PyString_AS_STRING(res) + respos;
3458 ressize = requiredsize;
3459 }
3460 /* check if there is anything unencodable in the replacement
3461 and copy it to the output */
3462 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3463 c = *uni2;
3464 if (c >= limit) {
3465 raise_encode_exception(&exc, encoding, startp, size,
3466 unicodepos, unicodepos+1, reason);
3467 Py_DECREF(repunicode);
3468 goto onError;
3469 }
3470 *str = (char)c;
3471 }
3472 p = startp + newpos;
3473 Py_DECREF(repunicode);
3474 }
3475 }
3476 }
3477 /* Resize if we allocated to much */
3478 respos = str-PyString_AS_STRING(res);
3479 if (respos<ressize)
3480 /* If this falls res will be NULL */
3481 _PyString_Resize(&res, respos);
3482 Py_XDECREF(errorHandler);
3483 Py_XDECREF(exc);
3484 return res;
3485
3486 onError:
3487 Py_XDECREF(res);
3488 Py_XDECREF(errorHandler);
3489 Py_XDECREF(exc);
3490 return NULL;
3491}
3492
Guido van Rossumd57fd912000-03-10 22:53:23 +00003493PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003494 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003495 const char *errors)
3496{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003497 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003498}
3499
3500PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3501{
3502 if (!PyUnicode_Check(unicode)) {
3503 PyErr_BadArgument();
3504 return NULL;
3505 }
3506 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3507 PyUnicode_GET_SIZE(unicode),
3508 NULL);
3509}
3510
3511/* --- 7-bit ASCII Codec -------------------------------------------------- */
3512
Guido van Rossumd57fd912000-03-10 22:53:23 +00003513PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003514 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003515 const char *errors)
3516{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003517 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003518 PyUnicodeObject *v;
3519 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003520 Py_ssize_t startinpos;
3521 Py_ssize_t endinpos;
3522 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003523 const char *e;
3524 PyObject *errorHandler = NULL;
3525 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003526
Guido van Rossumd57fd912000-03-10 22:53:23 +00003527 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003528 if (size == 1 && *(unsigned char*)s < 128) {
3529 Py_UNICODE r = *(unsigned char*)s;
3530 return PyUnicode_FromUnicode(&r, 1);
3531 }
Tim Petersced69f82003-09-16 20:30:58 +00003532
Guido van Rossumd57fd912000-03-10 22:53:23 +00003533 v = _PyUnicode_New(size);
3534 if (v == NULL)
3535 goto onError;
3536 if (size == 0)
3537 return (PyObject *)v;
3538 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003539 e = s + size;
3540 while (s < e) {
3541 register unsigned char c = (unsigned char)*s;
3542 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003543 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003544 ++s;
3545 }
3546 else {
3547 startinpos = s-starts;
3548 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003549 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003550 if (unicode_decode_call_errorhandler(
3551 errors, &errorHandler,
3552 "ascii", "ordinal not in range(128)",
3553 starts, size, &startinpos, &endinpos, &exc, &s,
3554 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003555 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003556 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003557 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003558 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003559 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003560 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003561 Py_XDECREF(errorHandler);
3562 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003564
Guido van Rossumd57fd912000-03-10 22:53:23 +00003565 onError:
3566 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003567 Py_XDECREF(errorHandler);
3568 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003569 return NULL;
3570}
3571
Guido van Rossumd57fd912000-03-10 22:53:23 +00003572PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003573 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003574 const char *errors)
3575{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003576 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003577}
3578
3579PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3580{
3581 if (!PyUnicode_Check(unicode)) {
3582 PyErr_BadArgument();
3583 return NULL;
3584 }
3585 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3586 PyUnicode_GET_SIZE(unicode),
3587 NULL);
3588}
3589
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003590#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003591
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003592/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003593
Martin v. Löwisd8251432006-06-14 05:21:04 +00003594#if SIZEOF_INT < SIZEOF_SSIZE_T
3595#define NEED_RETRY
3596#endif
3597
3598/* XXX This code is limited to "true" double-byte encodings, as
3599 a) it assumes an incomplete character consists of a single byte, and
3600 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3601 encodings, see IsDBCSLeadByteEx documentation. */
3602
/* Return non-zero if the byte at s[offset] is classified as a DBCS lead
   byte, judged together with the position CharPrev() reports for the
   preceding character; return 0 otherwise. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3613
3614/*
3615 * Decode MBCS string into unicode object. If 'final' is set, converts
3616 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3617 */
3618static int decode_mbcs(PyUnicodeObject **v,
3619 const char *s, /* MBCS string */
3620 int size, /* sizeof MBCS string */
3621 int final)
3622{
3623 Py_UNICODE *p;
3624 Py_ssize_t n = 0;
3625 int usize = 0;
3626
3627 assert(size >= 0);
3628
3629 /* Skip trailing lead-byte unless 'final' is set */
3630 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3631 --size;
3632
3633 /* First get the size of the result */
3634 if (size > 0) {
3635 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3636 if (usize == 0) {
3637 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3638 return -1;
3639 }
3640 }
3641
3642 if (*v == NULL) {
3643 /* Create unicode object */
3644 *v = _PyUnicode_New(usize);
3645 if (*v == NULL)
3646 return -1;
3647 }
3648 else {
3649 /* Extend unicode object */
3650 n = PyUnicode_GET_SIZE(*v);
3651 if (_PyUnicode_Resize(v, n + usize) < 0)
3652 return -1;
3653 }
3654
3655 /* Do the conversion */
3656 if (size > 0) {
3657 p = PyUnicode_AS_UNICODE(*v) + n;
3658 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3659 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3660 return -1;
3661 }
3662 }
3663
3664 return size;
3665}
3666
3667PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3668 Py_ssize_t size,
3669 const char *errors,
3670 Py_ssize_t *consumed)
3671{
3672 PyUnicodeObject *v = NULL;
3673 int done;
3674
3675 if (consumed)
3676 *consumed = 0;
3677
3678#ifdef NEED_RETRY
3679 retry:
3680 if (size > INT_MAX)
3681 done = decode_mbcs(&v, s, INT_MAX, 0);
3682 else
3683#endif
3684 done = decode_mbcs(&v, s, (int)size, !consumed);
3685
3686 if (done < 0) {
3687 Py_XDECREF(v);
3688 return NULL;
3689 }
3690
3691 if (consumed)
3692 *consumed += done;
3693
3694#ifdef NEED_RETRY
3695 if (size > INT_MAX) {
3696 s += done;
3697 size -= done;
3698 goto retry;
3699 }
3700#endif
3701
3702 return (PyObject *)v;
3703}
3704
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003705PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003706 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003707 const char *errors)
3708{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003709 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3710}
3711
3712/*
3713 * Convert unicode into string object (MBCS).
3714 * Returns 0 if succeed, -1 otherwise.
3715 */
3716static int encode_mbcs(PyObject **repr,
3717 const Py_UNICODE *p, /* unicode */
3718 int size) /* size of unicode */
3719{
3720 int mbcssize = 0;
3721 Py_ssize_t n = 0;
3722
3723 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003724
3725 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003726 if (size > 0) {
3727 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3728 if (mbcssize == 0) {
3729 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3730 return -1;
3731 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003732 }
3733
Martin v. Löwisd8251432006-06-14 05:21:04 +00003734 if (*repr == NULL) {
3735 /* Create string object */
3736 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3737 if (*repr == NULL)
3738 return -1;
3739 }
3740 else {
3741 /* Extend string object */
3742 n = PyString_Size(*repr);
3743 if (_PyString_Resize(repr, n + mbcssize) < 0)
3744 return -1;
3745 }
3746
3747 /* Do the conversion */
3748 if (size > 0) {
3749 char *s = PyString_AS_STRING(*repr) + n;
3750 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3751 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3752 return -1;
3753 }
3754 }
3755
3756 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003757}
3758
3759PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003760 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003761 const char *errors)
3762{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003763 PyObject *repr = NULL;
3764 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003765
Martin v. Löwisd8251432006-06-14 05:21:04 +00003766#ifdef NEED_RETRY
3767 retry:
3768 if (size > INT_MAX)
3769 ret = encode_mbcs(&repr, p, INT_MAX);
3770 else
3771#endif
3772 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003773
Martin v. Löwisd8251432006-06-14 05:21:04 +00003774 if (ret < 0) {
3775 Py_XDECREF(repr);
3776 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003777 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003778
3779#ifdef NEED_RETRY
3780 if (size > INT_MAX) {
3781 p += INT_MAX;
3782 size -= INT_MAX;
3783 goto retry;
3784 }
3785#endif
3786
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003787 return repr;
3788}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003789
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003790PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3791{
3792 if (!PyUnicode_Check(unicode)) {
3793 PyErr_BadArgument();
3794 return NULL;
3795 }
3796 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3797 PyUnicode_GET_SIZE(unicode),
3798 NULL);
3799}
3800
Martin v. Löwisd8251432006-06-14 05:21:04 +00003801#undef NEED_RETRY
3802
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003803#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003804
Guido van Rossumd57fd912000-03-10 22:53:23 +00003805/* --- Character Mapping Codec -------------------------------------------- */
3806
Guido van Rossumd57fd912000-03-10 22:53:23 +00003807PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003808 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003809 PyObject *mapping,
3810 const char *errors)
3811{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003812 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003813 Py_ssize_t startinpos;
3814 Py_ssize_t endinpos;
3815 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003816 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817 PyUnicodeObject *v;
3818 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003819 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003820 PyObject *errorHandler = NULL;
3821 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003822 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003823 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003824
Guido van Rossumd57fd912000-03-10 22:53:23 +00003825 /* Default to Latin-1 */
3826 if (mapping == NULL)
3827 return PyUnicode_DecodeLatin1(s, size, errors);
3828
3829 v = _PyUnicode_New(size);
3830 if (v == NULL)
3831 goto onError;
3832 if (size == 0)
3833 return (PyObject *)v;
3834 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003835 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003836 if (PyUnicode_CheckExact(mapping)) {
3837 mapstring = PyUnicode_AS_UNICODE(mapping);
3838 maplen = PyUnicode_GET_SIZE(mapping);
3839 while (s < e) {
3840 unsigned char ch = *s;
3841 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003842
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003843 if (ch < maplen)
3844 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003845
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003846 if (x == 0xfffe) {
3847 /* undefined mapping */
3848 outpos = p-PyUnicode_AS_UNICODE(v);
3849 startinpos = s-starts;
3850 endinpos = startinpos+1;
3851 if (unicode_decode_call_errorhandler(
3852 errors, &errorHandler,
3853 "charmap", "character maps to <undefined>",
3854 starts, size, &startinpos, &endinpos, &exc, &s,
3855 (PyObject **)&v, &outpos, &p)) {
3856 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003857 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003858 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003859 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003860 *p++ = x;
3861 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003862 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003863 }
3864 else {
3865 while (s < e) {
3866 unsigned char ch = *s;
3867 PyObject *w, *x;
3868
3869 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3870 w = PyInt_FromLong((long)ch);
3871 if (w == NULL)
3872 goto onError;
3873 x = PyObject_GetItem(mapping, w);
3874 Py_DECREF(w);
3875 if (x == NULL) {
3876 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3877 /* No mapping found means: mapping is undefined. */
3878 PyErr_Clear();
3879 x = Py_None;
3880 Py_INCREF(x);
3881 } else
3882 goto onError;
3883 }
3884
3885 /* Apply mapping */
3886 if (PyInt_Check(x)) {
3887 long value = PyInt_AS_LONG(x);
3888 if (value < 0 || value > 65535) {
3889 PyErr_SetString(PyExc_TypeError,
3890 "character mapping must be in range(65536)");
3891 Py_DECREF(x);
3892 goto onError;
3893 }
3894 *p++ = (Py_UNICODE)value;
3895 }
3896 else if (x == Py_None) {
3897 /* undefined mapping */
3898 outpos = p-PyUnicode_AS_UNICODE(v);
3899 startinpos = s-starts;
3900 endinpos = startinpos+1;
3901 if (unicode_decode_call_errorhandler(
3902 errors, &errorHandler,
3903 "charmap", "character maps to <undefined>",
3904 starts, size, &startinpos, &endinpos, &exc, &s,
3905 (PyObject **)&v, &outpos, &p)) {
3906 Py_DECREF(x);
3907 goto onError;
3908 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003909 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003910 continue;
3911 }
3912 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003913 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003914
3915 if (targetsize == 1)
3916 /* 1-1 mapping */
3917 *p++ = *PyUnicode_AS_UNICODE(x);
3918
3919 else if (targetsize > 1) {
3920 /* 1-n mapping */
3921 if (targetsize > extrachars) {
3922 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003923 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3924 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003925 (targetsize << 2);
3926 extrachars += needed;
Armin Rigo7ccbca92006-10-04 12:17:45 +00003927 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003928 if (_PyUnicode_Resize(&v,
3929 PyUnicode_GET_SIZE(v) + needed) < 0) {
3930 Py_DECREF(x);
3931 goto onError;
3932 }
3933 p = PyUnicode_AS_UNICODE(v) + oldpos;
3934 }
3935 Py_UNICODE_COPY(p,
3936 PyUnicode_AS_UNICODE(x),
3937 targetsize);
3938 p += targetsize;
3939 extrachars -= targetsize;
3940 }
3941 /* 1-0 mapping: skip the character */
3942 }
3943 else {
3944 /* wrong return value */
3945 PyErr_SetString(PyExc_TypeError,
3946 "character mapping must return integer, None or unicode");
3947 Py_DECREF(x);
3948 goto onError;
3949 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003950 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003951 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003952 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003953 }
3954 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003955 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003956 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003957 Py_XDECREF(errorHandler);
3958 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003959 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003960
Guido van Rossumd57fd912000-03-10 22:53:23 +00003961 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003962 Py_XDECREF(errorHandler);
3963 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003964 Py_XDECREF(v);
3965 return NULL;
3966}
3967
Martin v. Löwis3f767792006-06-04 19:36:28 +00003968/* Charmap encoding: the lookup table */
3969
3970struct encoding_map{
3971 PyObject_HEAD
3972 unsigned char level1[32];
3973 int count2, count3;
3974 unsigned char level23[1];
3975};
3976
3977static PyObject*
3978encoding_map_size(PyObject *obj, PyObject* args)
3979{
3980 struct encoding_map *map = (struct encoding_map*)obj;
3981 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3982 128*map->count3);
3983}
3984
3985static PyMethodDef encoding_map_methods[] = {
3986 {"size", encoding_map_size, METH_NOARGS,
3987 PyDoc_STR("Return the size (in bytes) of this object") },
3988 { 0 }
3989};
3990
3991static void
3992encoding_map_dealloc(PyObject* o)
3993{
3994 PyObject_FREE(o);
3995}
3996
3997static PyTypeObject EncodingMapType = {
Martin v. Löwis68192102007-07-21 06:55:02 +00003998 PyVarObject_HEAD_INIT(NULL, 0)
Martin v. Löwis3f767792006-06-04 19:36:28 +00003999 "EncodingMap", /*tp_name*/
4000 sizeof(struct encoding_map), /*tp_basicsize*/
4001 0, /*tp_itemsize*/
4002 /* methods */
4003 encoding_map_dealloc, /*tp_dealloc*/
4004 0, /*tp_print*/
4005 0, /*tp_getattr*/
4006 0, /*tp_setattr*/
4007 0, /*tp_compare*/
4008 0, /*tp_repr*/
4009 0, /*tp_as_number*/
4010 0, /*tp_as_sequence*/
4011 0, /*tp_as_mapping*/
4012 0, /*tp_hash*/
4013 0, /*tp_call*/
4014 0, /*tp_str*/
4015 0, /*tp_getattro*/
4016 0, /*tp_setattro*/
4017 0, /*tp_as_buffer*/
4018 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4019 0, /*tp_doc*/
4020 0, /*tp_traverse*/
4021 0, /*tp_clear*/
4022 0, /*tp_richcompare*/
4023 0, /*tp_weaklistoffset*/
4024 0, /*tp_iter*/
4025 0, /*tp_iternext*/
4026 encoding_map_methods, /*tp_methods*/
4027 0, /*tp_members*/
4028 0, /*tp_getset*/
4029 0, /*tp_base*/
4030 0, /*tp_dict*/
4031 0, /*tp_descr_get*/
4032 0, /*tp_descr_set*/
4033 0, /*tp_dictoffset*/
4034 0, /*tp_init*/
4035 0, /*tp_alloc*/
4036 0, /*tp_new*/
4037 0, /*tp_free*/
4038 0, /*tp_is_gc*/
4039};
4040
4041PyObject*
4042PyUnicode_BuildEncodingMap(PyObject* string)
4043{
4044 Py_UNICODE *decode;
4045 PyObject *result;
4046 struct encoding_map *mresult;
4047 int i;
4048 int need_dict = 0;
4049 unsigned char level1[32];
4050 unsigned char level2[512];
4051 unsigned char *mlevel1, *mlevel2, *mlevel3;
4052 int count2 = 0, count3 = 0;
4053
4054 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4055 PyErr_BadArgument();
4056 return NULL;
4057 }
4058 decode = PyUnicode_AS_UNICODE(string);
4059 memset(level1, 0xFF, sizeof level1);
4060 memset(level2, 0xFF, sizeof level2);
4061
4062 /* If there isn't a one-to-one mapping of NULL to \0,
4063 or if there are non-BMP characters, we need to use
4064 a mapping dictionary. */
4065 if (decode[0] != 0)
4066 need_dict = 1;
4067 for (i = 1; i < 256; i++) {
4068 int l1, l2;
4069 if (decode[i] == 0
4070 #ifdef Py_UNICODE_WIDE
4071 || decode[i] > 0xFFFF
4072 #endif
4073 ) {
4074 need_dict = 1;
4075 break;
4076 }
4077 if (decode[i] == 0xFFFE)
4078 /* unmapped character */
4079 continue;
4080 l1 = decode[i] >> 11;
4081 l2 = decode[i] >> 7;
4082 if (level1[l1] == 0xFF)
4083 level1[l1] = count2++;
4084 if (level2[l2] == 0xFF)
4085 level2[l2] = count3++;
4086 }
4087
4088 if (count2 >= 0xFF || count3 >= 0xFF)
4089 need_dict = 1;
4090
4091 if (need_dict) {
4092 PyObject *result = PyDict_New();
4093 PyObject *key, *value;
4094 if (!result)
4095 return NULL;
4096 for (i = 0; i < 256; i++) {
4097 key = value = NULL;
4098 key = PyInt_FromLong(decode[i]);
4099 value = PyInt_FromLong(i);
4100 if (!key || !value)
4101 goto failed1;
4102 if (PyDict_SetItem(result, key, value) == -1)
4103 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004104 Py_DECREF(key);
4105 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004106 }
4107 return result;
4108 failed1:
4109 Py_XDECREF(key);
4110 Py_XDECREF(value);
4111 Py_DECREF(result);
4112 return NULL;
4113 }
4114
4115 /* Create a three-level trie */
4116 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4117 16*count2 + 128*count3 - 1);
4118 if (!result)
4119 return PyErr_NoMemory();
4120 PyObject_Init(result, &EncodingMapType);
4121 mresult = (struct encoding_map*)result;
4122 mresult->count2 = count2;
4123 mresult->count3 = count3;
4124 mlevel1 = mresult->level1;
4125 mlevel2 = mresult->level23;
4126 mlevel3 = mresult->level23 + 16*count2;
4127 memcpy(mlevel1, level1, 32);
4128 memset(mlevel2, 0xFF, 16*count2);
4129 memset(mlevel3, 0, 128*count3);
4130 count3 = 0;
4131 for (i = 1; i < 256; i++) {
4132 int o1, o2, o3, i2, i3;
4133 if (decode[i] == 0xFFFE)
4134 /* unmapped character */
4135 continue;
4136 o1 = decode[i]>>11;
4137 o2 = (decode[i]>>7) & 0xF;
4138 i2 = 16*mlevel1[o1] + o2;
4139 if (mlevel2[i2] == 0xFF)
4140 mlevel2[i2] = count3++;
4141 o3 = decode[i] & 0x7F;
4142 i3 = 128*mlevel2[i2] + o3;
4143 mlevel3[i3] = i;
4144 }
4145 return result;
4146}
4147
4148static int
4149encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4150{
4151 struct encoding_map *map = (struct encoding_map*)mapping;
4152 int l1 = c>>11;
4153 int l2 = (c>>7) & 0xF;
4154 int l3 = c & 0x7F;
4155 int i;
4156
4157#ifdef Py_UNICODE_WIDE
4158 if (c > 0xFFFF) {
4159 return -1;
4160 }
4161#endif
4162 if (c == 0)
4163 return 0;
4164 /* level 1*/
4165 i = map->level1[l1];
4166 if (i == 0xFF) {
4167 return -1;
4168 }
4169 /* level 2*/
4170 i = map->level23[16*i+l2];
4171 if (i == 0xFF) {
4172 return -1;
4173 }
4174 /* level 3 */
4175 i = map->level23[16*map->count2 + 128*i + l3];
4176 if (i == 0) {
4177 return -1;
4178 }
4179 return i;
4180}
4181
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004182/* Lookup the character ch in the mapping. If the character
4183 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004184 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004185static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004186{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004187 PyObject *w = PyInt_FromLong((long)c);
4188 PyObject *x;
4189
4190 if (w == NULL)
4191 return NULL;
4192 x = PyObject_GetItem(mapping, w);
4193 Py_DECREF(w);
4194 if (x == NULL) {
4195 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4196 /* No mapping found means: mapping is undefined. */
4197 PyErr_Clear();
4198 x = Py_None;
4199 Py_INCREF(x);
4200 return x;
4201 } else
4202 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004203 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004204 else if (x == Py_None)
4205 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004206 else if (PyInt_Check(x)) {
4207 long value = PyInt_AS_LONG(x);
4208 if (value < 0 || value > 255) {
4209 PyErr_SetString(PyExc_TypeError,
4210 "character mapping must be in range(256)");
4211 Py_DECREF(x);
4212 return NULL;
4213 }
4214 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004215 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004216 else if (PyString_Check(x))
4217 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004218 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004219 /* wrong return value */
4220 PyErr_SetString(PyExc_TypeError,
4221 "character mapping must return integer, None or str");
4222 Py_DECREF(x);
4223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004224 }
4225}
4226
Martin v. Löwis3f767792006-06-04 19:36:28 +00004227static int
4228charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4229{
4230 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4231 /* exponentially overallocate to minimize reallocations */
4232 if (requiredsize < 2*outsize)
4233 requiredsize = 2*outsize;
4234 if (_PyString_Resize(outobj, requiredsize)) {
4235 return 0;
4236 }
4237 return 1;
4238}
4239
/* Outcome of trying to encode a single character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred during lookup or resizing */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004243/* lookup the character, put the result in the output string and adjust
4244 various state variables. Reallocate the output string if not enough
4245 space is available. Return a new reference to the object that
4246 was put in the output buffer, or Py_None, if the mapping was undefined
4247 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004248 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004249static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004250charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004251 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004252{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004253 PyObject *rep;
4254 char *outstart;
4255 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004256
Christian Heimese93237d2007-12-19 02:37:44 +00004257 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004258 int res = encoding_map_lookup(c, mapping);
4259 Py_ssize_t requiredsize = *outpos+1;
4260 if (res == -1)
4261 return enc_FAILED;
4262 if (outsize<requiredsize)
4263 if (!charmapencode_resize(outobj, outpos, requiredsize))
4264 return enc_EXCEPTION;
4265 outstart = PyString_AS_STRING(*outobj);
4266 outstart[(*outpos)++] = (char)res;
4267 return enc_SUCCESS;
4268 }
4269
4270 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004271 if (rep==NULL)
Martin v. Löwis3f767792006-06-04 19:36:28 +00004272 return enc_EXCEPTION;
4273 else if (rep==Py_None) {
4274 Py_DECREF(rep);
4275 return enc_FAILED;
4276 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004277 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004278 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004279 if (outsize<requiredsize)
4280 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004281 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004282 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004283 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004284 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004285 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4286 }
4287 else {
4288 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004289 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4290 Py_ssize_t requiredsize = *outpos+repsize;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004291 if (outsize<requiredsize)
4292 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004293 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004294 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004295 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004296 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004297 memcpy(outstart + *outpos, repchars, repsize);
4298 *outpos += repsize;
4299 }
4300 }
Georg Brandl9f167602006-06-04 21:46:16 +00004301 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004302 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004303}
4304
4305/* handle an error in PyUnicode_EncodeCharmap
4306 Return 0 on success, -1 on error */
4307static
4308int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004309 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004310 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004311 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004312 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004313{
4314 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004315 Py_ssize_t repsize;
4316 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004317 Py_UNICODE *uni2;
4318 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004319 Py_ssize_t collstartpos = *inpos;
4320 Py_ssize_t collendpos = *inpos+1;
4321 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004322 char *encoding = "charmap";
4323 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004324 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004325
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004326 /* find all unencodable characters */
4327 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004328 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004329 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004330 int res = encoding_map_lookup(p[collendpos], mapping);
4331 if (res != -1)
4332 break;
4333 ++collendpos;
4334 continue;
4335 }
4336
4337 rep = charmapencode_lookup(p[collendpos], mapping);
4338 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004339 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004340 else if (rep!=Py_None) {
4341 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004342 break;
4343 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004344 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004345 ++collendpos;
4346 }
4347 /* cache callback name lookup
4348 * (if not done yet, i.e. it's the first error) */
4349 if (*known_errorHandler==-1) {
4350 if ((errors==NULL) || (!strcmp(errors, "strict")))
4351 *known_errorHandler = 1;
4352 else if (!strcmp(errors, "replace"))
4353 *known_errorHandler = 2;
4354 else if (!strcmp(errors, "ignore"))
4355 *known_errorHandler = 3;
4356 else if (!strcmp(errors, "xmlcharrefreplace"))
4357 *known_errorHandler = 4;
4358 else
4359 *known_errorHandler = 0;
4360 }
4361 switch (*known_errorHandler) {
4362 case 1: /* strict */
4363 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4364 return -1;
4365 case 2: /* replace */
4366 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4367 x = charmapencode_output('?', mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004368 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004369 return -1;
4370 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004371 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004372 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4373 return -1;
4374 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004375 }
4376 /* fall through */
4377 case 3: /* ignore */
4378 *inpos = collendpos;
4379 break;
4380 case 4: /* xmlcharrefreplace */
4381 /* generate replacement (temporarily (mis)uses p) */
4382 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4383 char buffer[2+29+1+1];
4384 char *cp;
4385 sprintf(buffer, "&#%d;", (int)p[collpos]);
4386 for (cp = buffer; *cp; ++cp) {
4387 x = charmapencode_output(*cp, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004388 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004389 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004390 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004391 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4392 return -1;
4393 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004394 }
4395 }
4396 *inpos = collendpos;
4397 break;
4398 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004399 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004400 encoding, reason, p, size, exceptionObject,
4401 collstartpos, collendpos, &newpos);
4402 if (repunicode == NULL)
4403 return -1;
4404 /* generate replacement */
4405 repsize = PyUnicode_GET_SIZE(repunicode);
4406 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4407 x = charmapencode_output(*uni2, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004408 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409 return -1;
4410 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004411 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004412 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004413 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4414 return -1;
4415 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004416 }
4417 *inpos = newpos;
4418 Py_DECREF(repunicode);
4419 }
4420 return 0;
4421}
4422
Guido van Rossumd57fd912000-03-10 22:53:23 +00004423PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004424 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004425 PyObject *mapping,
4426 const char *errors)
4427{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004428 /* output object */
4429 PyObject *res = NULL;
4430 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004431 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004432 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004433 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434 PyObject *errorHandler = NULL;
4435 PyObject *exc = NULL;
4436 /* the following variable is used for caching string comparisons
4437 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4438 * 3=ignore, 4=xmlcharrefreplace */
4439 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004440
4441 /* Default to Latin-1 */
4442 if (mapping == NULL)
4443 return PyUnicode_EncodeLatin1(p, size, errors);
4444
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004445 /* allocate enough for a simple encoding without
4446 replacements, if we need more, we'll resize */
4447 res = PyString_FromStringAndSize(NULL, size);
4448 if (res == NULL)
4449 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004450 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004451 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004452
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004453 while (inpos<size) {
4454 /* try to encode it */
Martin v. Löwis3f767792006-06-04 19:36:28 +00004455 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4456 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004457 goto onError;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004458 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004459 if (charmap_encoding_error(p, size, &inpos, mapping,
4460 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004461 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00004462 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004463 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004464 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004465 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004466 else
4467 /* done with this character => adjust input position */
4468 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004470
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004471 /* Resize if we allocated to much */
4472 if (respos<PyString_GET_SIZE(res)) {
4473 if (_PyString_Resize(&res, respos))
4474 goto onError;
4475 }
4476 Py_XDECREF(exc);
4477 Py_XDECREF(errorHandler);
4478 return res;
4479
4480 onError:
4481 Py_XDECREF(res);
4482 Py_XDECREF(exc);
4483 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004484 return NULL;
4485}
4486
4487PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4488 PyObject *mapping)
4489{
4490 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4491 PyErr_BadArgument();
4492 return NULL;
4493 }
4494 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4495 PyUnicode_GET_SIZE(unicode),
4496 mapping,
4497 NULL);
4498}
4499
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004500/* create or adjust a UnicodeTranslateError */
4501static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004502 const Py_UNICODE *unicode, Py_ssize_t size,
4503 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004504 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004505{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004506 if (*exceptionObject == NULL) {
4507 *exceptionObject = PyUnicodeTranslateError_Create(
4508 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004509 }
4510 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004511 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4512 goto onError;
4513 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4514 goto onError;
4515 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4516 goto onError;
4517 return;
4518 onError:
4519 Py_DECREF(*exceptionObject);
4520 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004521 }
4522}
4523
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004524/* raises a UnicodeTranslateError */
4525static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004526 const Py_UNICODE *unicode, Py_ssize_t size,
4527 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004528 const char *reason)
4529{
4530 make_translate_exception(exceptionObject,
4531 unicode, size, startpos, endpos, reason);
4532 if (*exceptionObject != NULL)
4533 PyCodec_StrictErrors(*exceptionObject);
4534}
4535
4536/* error handling callback helper:
4537 build arguments, call the callback and check the arguments,
4538 put the result into newpos and return the replacement string, which
4539 has to be freed by the caller */
4540static PyObject *unicode_translate_call_errorhandler(const char *errors,
4541 PyObject **errorHandler,
4542 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004543 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4544 Py_ssize_t startpos, Py_ssize_t endpos,
4545 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004546{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004547 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004548
Martin v. Löwis412fb672006-04-13 06:34:32 +00004549 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004550 PyObject *restuple;
4551 PyObject *resunicode;
4552
4553 if (*errorHandler == NULL) {
4554 *errorHandler = PyCodec_LookupError(errors);
4555 if (*errorHandler == NULL)
4556 return NULL;
4557 }
4558
4559 make_translate_exception(exceptionObject,
4560 unicode, size, startpos, endpos, reason);
4561 if (*exceptionObject == NULL)
4562 return NULL;
4563
4564 restuple = PyObject_CallFunctionObjArgs(
4565 *errorHandler, *exceptionObject, NULL);
4566 if (restuple == NULL)
4567 return NULL;
4568 if (!PyTuple_Check(restuple)) {
4569 PyErr_Format(PyExc_TypeError, &argparse[4]);
4570 Py_DECREF(restuple);
4571 return NULL;
4572 }
4573 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004574 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004575 Py_DECREF(restuple);
4576 return NULL;
4577 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004578 if (i_newpos<0)
4579 *newpos = size+i_newpos;
4580 else
4581 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004582 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004583 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004584 Py_DECREF(restuple);
4585 return NULL;
4586 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004587 Py_INCREF(resunicode);
4588 Py_DECREF(restuple);
4589 return resunicode;
4590}
4591
4592/* Lookup the character ch in the mapping and put the result in result,
4593 which must be decrefed by the caller.
4594 Return 0 on success, -1 on error */
4595static
4596int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4597{
4598 PyObject *w = PyInt_FromLong((long)c);
4599 PyObject *x;
4600
4601 if (w == NULL)
4602 return -1;
4603 x = PyObject_GetItem(mapping, w);
4604 Py_DECREF(w);
4605 if (x == NULL) {
4606 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4607 /* No mapping found means: use 1:1 mapping. */
4608 PyErr_Clear();
4609 *result = NULL;
4610 return 0;
4611 } else
4612 return -1;
4613 }
4614 else if (x == Py_None) {
4615 *result = x;
4616 return 0;
4617 }
4618 else if (PyInt_Check(x)) {
4619 long value = PyInt_AS_LONG(x);
4620 long max = PyUnicode_GetMax();
4621 if (value < 0 || value > max) {
4622 PyErr_Format(PyExc_TypeError,
4623 "character mapping must be in range(0x%lx)", max+1);
4624 Py_DECREF(x);
4625 return -1;
4626 }
4627 *result = x;
4628 return 0;
4629 }
4630 else if (PyUnicode_Check(x)) {
4631 *result = x;
4632 return 0;
4633 }
4634 else {
4635 /* wrong return value */
4636 PyErr_SetString(PyExc_TypeError,
4637 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004638 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004639 return -1;
4640 }
4641}
4642/* ensure that *outobj is at least requiredsize characters long,
4643if not reallocate and adjust various state variables.
4644Return 0 on success, -1 on error */
4645static
Walter Dörwald4894c302003-10-24 14:25:28 +00004646int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004647 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004648{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004649 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004650 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004651 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004652 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004653 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004654 if (requiredsize < 2 * oldsize)
4655 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004656 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004657 return -1;
4658 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004659 }
4660 return 0;
4661}
4662/* lookup the character, put the result in the output string and adjust
4663 various state variables. Return a new reference to the object that
4664 was put in the output buffer in *result, or Py_None, if the mapping was
4665 undefined (in which case no character was written).
4666 The called must decref result.
4667 Return 0 on success, -1 on error. */
4668static
Walter Dörwald4894c302003-10-24 14:25:28 +00004669int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004670 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004671 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004672{
Walter Dörwald4894c302003-10-24 14:25:28 +00004673 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004674 return -1;
4675 if (*res==NULL) {
4676 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004677 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004678 }
4679 else if (*res==Py_None)
4680 ;
4681 else if (PyInt_Check(*res)) {
4682 /* no overflow check, because we know that the space is enough */
4683 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4684 }
4685 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004686 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004687 if (repsize==1) {
4688 /* no overflow check, because we know that the space is enough */
4689 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4690 }
4691 else if (repsize!=0) {
4692 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004693 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004694 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004695 repsize - 1;
4696 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004697 return -1;
4698 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4699 *outp += repsize;
4700 }
4701 }
4702 else
4703 return -1;
4704 return 0;
4705}
4706
/* Translate the size characters starting at p through mapping and
   return the result as a new unicode object, or NULL on error.

   For each character, mapping[c] may yield an integer (a replacement
   code point), a unicode object (a replacement string) or None (the
   character is untranslatable); a LookupError means the character is
   copied unchanged.  Runs of untranslatable characters are handled
   according to errors: "strict" (also used when errors is NULL),
   "replace", "ignore", "xmlcharrefreplace" or the name of a registered
   error handler.  mapping must not be NULL. */
PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
                                     Py_ssize_t size,
                                     PyObject *mapping,
                                     const char *errors)
{
    /* output object */
    PyObject *res = NULL;
    /* pointers to the beginning and end+1 of input */
    const Py_UNICODE *startp = p;
    const Py_UNICODE *endp = p + size;
    /* pointer into the output */
    Py_UNICODE *str;
    /* current output position */
    Py_ssize_t respos = 0;
    char *reason = "character maps to <undefined>";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
     * 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    /* allocate enough for a simple 1:1 translation without
       replacements, if we need more, we'll resize */
    res = PyUnicode_FromUnicode(NULL, size);
    if (res == NULL)
        goto onError;
    if (size == 0)
        return res;
    str = PyUnicode_AS_UNICODE(res);

    while (p<endp) {
        /* try to translate it */
        PyObject *x = NULL;
        if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
            Py_XDECREF(x);
            goto onError;
        }
        /* x is only compared against Py_None below, never dereferenced,
           so it can be released right away */
        Py_XDECREF(x);
        if (x!=Py_None) /* it worked => adjust input pointer */
            ++p;
        else { /* untranslatable character */
            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
            Py_ssize_t repsize;
            Py_ssize_t newpos;
            Py_UNICODE *uni2;
            /* startpos for collecting untranslatable chars */
            const Py_UNICODE *collstart = p;
            const Py_UNICODE *collend = p+1;
            const Py_UNICODE *coll;

            /* find all untranslatable characters */
            while (collend < endp) {
                if (charmaptranslate_lookup(*collend, mapping, &x))
                    goto onError;
                Py_XDECREF(x);
                if (x!=Py_None)
                    break;
                ++collend;
            }
            /* cache callback name lookup
             * (if not done yet, i.e. it's the first error) */
            if (known_errorHandler==-1) {
                if ((errors==NULL) || (!strcmp(errors, "strict")))
                    known_errorHandler = 1;
                else if (!strcmp(errors, "replace"))
                    known_errorHandler = 2;
                else if (!strcmp(errors, "ignore"))
                    known_errorHandler = 3;
                else if (!strcmp(errors, "xmlcharrefreplace"))
                    known_errorHandler = 4;
                else
                    known_errorHandler = 0;
            }
            switch (known_errorHandler) {
            case 1: /* strict */
                raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
                goto onError;
            case 2: /* replace */
                /* No need to check for space, this is a 1:1 replacement */
                for (coll = collstart; coll<collend; ++coll)
                    *str++ = '?';
                /* fall through */
            case 3: /* ignore */
                p = collend;
                break;
            case 4: /* xmlcharrefreplace */
                /* generate replacement (temporarily (mis)uses p) */
                for (p = collstart; p < collend; ++p) {
                    char buffer[2+29+1+1];
                    char *cp;
                    sprintf(buffer, "&#%d;", (int)*p);
                    /* room for: output so far + this reference + the
                       input following the collected run */
                    if (charmaptranslate_makespace(&res, &str,
                        (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
                        goto onError;
                    for (cp = buffer; *cp; ++cp)
                        *str++ = *cp;
                }
                p = collend;
                break;
            default:
                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
                    reason, startp, size, &exc,
                    collstart-startp, collend-startp, &newpos);
                if (repunicode == NULL)
                    goto onError;
                /* generate replacement */
                repsize = PyUnicode_GET_SIZE(repunicode);
                if (charmaptranslate_makespace(&res, &str,
                    (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
                    Py_DECREF(repunicode);
                    goto onError;
                }
                /* the replacement is copied verbatim; it is not run
                   through the mapping again */
                for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
                    *str++ = *uni2;
                /* continue at the position chosen by the error handler */
                p = startp + newpos;
                Py_DECREF(repunicode);
            }
        }
    }
    /* Resize if we allocated too much */
    respos = str-PyUnicode_AS_UNICODE(res);
    if (respos<PyUnicode_GET_SIZE(res)) {
        if (_PyUnicode_Resize(&res, respos) < 0)
            goto onError;
    }
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return res;

  onError:
    Py_XDECREF(res);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
4848
4849PyObject *PyUnicode_Translate(PyObject *str,
4850 PyObject *mapping,
4851 const char *errors)
4852{
4853 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004854
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855 str = PyUnicode_FromObject(str);
4856 if (str == NULL)
4857 goto onError;
4858 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4859 PyUnicode_GET_SIZE(str),
4860 mapping,
4861 errors);
4862 Py_DECREF(str);
4863 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004864
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865 onError:
4866 Py_XDECREF(str);
4867 return NULL;
4868}
Tim Petersced69f82003-09-16 20:30:58 +00004869
Guido van Rossum9e896b32000-04-05 20:11:21 +00004870/* --- Decimal Encoder ---------------------------------------------------- */
4871
4872int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004873 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004874 char *output,
4875 const char *errors)
4876{
4877 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004878 PyObject *errorHandler = NULL;
4879 PyObject *exc = NULL;
4880 const char *encoding = "decimal";
4881 const char *reason = "invalid decimal Unicode string";
4882 /* the following variable is used for caching string comparisons
4883 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4884 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004885
4886 if (output == NULL) {
4887 PyErr_BadArgument();
4888 return -1;
4889 }
4890
4891 p = s;
4892 end = s + length;
4893 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004894 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004895 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004896 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004897 Py_ssize_t repsize;
4898 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004899 Py_UNICODE *uni2;
4900 Py_UNICODE *collstart;
4901 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004902
Guido van Rossum9e896b32000-04-05 20:11:21 +00004903 if (Py_UNICODE_ISSPACE(ch)) {
4904 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004905 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004906 continue;
4907 }
4908 decimal = Py_UNICODE_TODECIMAL(ch);
4909 if (decimal >= 0) {
4910 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004911 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004912 continue;
4913 }
Guido van Rossumba477042000-04-06 18:18:10 +00004914 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004915 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004916 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004917 continue;
4918 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004919 /* All other characters are considered unencodable */
4920 collstart = p;
4921 collend = p+1;
4922 while (collend < end) {
4923 if ((0 < *collend && *collend < 256) ||
4924 !Py_UNICODE_ISSPACE(*collend) ||
4925 Py_UNICODE_TODECIMAL(*collend))
4926 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004927 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004928 /* cache callback name lookup
4929 * (if not done yet, i.e. it's the first error) */
4930 if (known_errorHandler==-1) {
4931 if ((errors==NULL) || (!strcmp(errors, "strict")))
4932 known_errorHandler = 1;
4933 else if (!strcmp(errors, "replace"))
4934 known_errorHandler = 2;
4935 else if (!strcmp(errors, "ignore"))
4936 known_errorHandler = 3;
4937 else if (!strcmp(errors, "xmlcharrefreplace"))
4938 known_errorHandler = 4;
4939 else
4940 known_errorHandler = 0;
4941 }
4942 switch (known_errorHandler) {
4943 case 1: /* strict */
4944 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4945 goto onError;
4946 case 2: /* replace */
4947 for (p = collstart; p < collend; ++p)
4948 *output++ = '?';
4949 /* fall through */
4950 case 3: /* ignore */
4951 p = collend;
4952 break;
4953 case 4: /* xmlcharrefreplace */
4954 /* generate replacement (temporarily (mis)uses p) */
4955 for (p = collstart; p < collend; ++p)
4956 output += sprintf(output, "&#%d;", (int)*p);
4957 p = collend;
4958 break;
4959 default:
4960 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4961 encoding, reason, s, length, &exc,
4962 collstart-s, collend-s, &newpos);
4963 if (repunicode == NULL)
4964 goto onError;
4965 /* generate replacement */
4966 repsize = PyUnicode_GET_SIZE(repunicode);
4967 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4968 Py_UNICODE ch = *uni2;
4969 if (Py_UNICODE_ISSPACE(ch))
4970 *output++ = ' ';
4971 else {
4972 decimal = Py_UNICODE_TODECIMAL(ch);
4973 if (decimal >= 0)
4974 *output++ = '0' + decimal;
4975 else if (0 < ch && ch < 256)
4976 *output++ = (char)ch;
4977 else {
4978 Py_DECREF(repunicode);
4979 raise_encode_exception(&exc, encoding,
4980 s, length, collstart-s, collend-s, reason);
4981 goto onError;
4982 }
4983 }
4984 }
4985 p = s + newpos;
4986 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004987 }
4988 }
4989 /* 0-terminate the output string */
4990 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004991 Py_XDECREF(exc);
4992 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004993 return 0;
4994
4995 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004996 Py_XDECREF(exc);
4997 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004998 return -1;
4999}
5000
Guido van Rossumd57fd912000-03-10 22:53:23 +00005001/* --- Helpers ------------------------------------------------------------ */
5002
Fredrik Lundha50d2012006-05-26 17:04:58 +00005003#define STRINGLIB_CHAR Py_UNICODE
Fredrik Lundh6471ee42006-05-24 14:28:11 +00005004
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005005#define STRINGLIB_LEN PyUnicode_GET_SIZE
Fredrik Lundhb9479482006-05-26 17:22:38 +00005006#define STRINGLIB_NEW PyUnicode_FromUnicode
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005007#define STRINGLIB_STR PyUnicode_AS_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00005008
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005009Py_LOCAL_INLINE(int)
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00005010STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
5011{
Fredrik Lundh9c0e9c02006-05-26 18:24:15 +00005012 if (str[0] != other[0])
5013 return 1;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00005014 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
5015}
5016
Fredrik Lundhb9479482006-05-26 17:22:38 +00005017#define STRINGLIB_EMPTY unicode_empty
Facundo Batista6f7e6fb2007-11-16 19:16:15 +00005018#define FROM_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00005019
Fredrik Lundha50d2012006-05-26 17:04:58 +00005020#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005021
5022#include "stringlib/count.h"
5023#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005024#include "stringlib/partition.h"
5025
/* Helper macro that normalises slice bounds in place.  It operates on
   variables named `start` and `end` that must exist at the point of
   expansion: negative values are taken relative to (obj)->length and
   then floored at 0, and `end` is capped at (obj)->length. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5038
Martin v. Löwis18e16552006-02-15 17:27:45 +00005039Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005040 PyObject *substr,
5041 Py_ssize_t start,
5042 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005043{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005044 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005045 PyUnicodeObject* str_obj;
5046 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005047
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005048 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5049 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005051 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5052 if (!sub_obj) {
5053 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005054 return -1;
5055 }
Tim Petersced69f82003-09-16 20:30:58 +00005056
Fredrik Lundhc8162812006-05-26 19:33:03 +00005057 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005058
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005059 result = stringlib_count(
5060 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5061 );
5062
5063 Py_DECREF(sub_obj);
5064 Py_DECREF(str_obj);
5065
Guido van Rossumd57fd912000-03-10 22:53:23 +00005066 return result;
5067}
5068
Martin v. Löwis18e16552006-02-15 17:27:45 +00005069Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005070 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005071 Py_ssize_t start,
5072 Py_ssize_t end,
5073 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005074{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005075 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005076
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005077 str = PyUnicode_FromObject(str);
5078 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005079 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005080 sub = PyUnicode_FromObject(sub);
5081 if (!sub) {
5082 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005083 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005084 }
Tim Petersced69f82003-09-16 20:30:58 +00005085
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005086 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005087 result = stringlib_find_slice(
5088 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5089 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5090 start, end
5091 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005092 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005093 result = stringlib_rfind_slice(
5094 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5095 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5096 start, end
5097 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005098
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005099 Py_DECREF(str);
5100 Py_DECREF(sub);
5101
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102 return result;
5103}
5104
Tim Petersced69f82003-09-16 20:30:58 +00005105static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005106int tailmatch(PyUnicodeObject *self,
5107 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005108 Py_ssize_t start,
5109 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005110 int direction)
5111{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005112 if (substring->length == 0)
5113 return 1;
5114
Fredrik Lundhc8162812006-05-26 19:33:03 +00005115 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005116
5117 end -= substring->length;
5118 if (end < start)
5119 return 0;
5120
5121 if (direction > 0) {
5122 if (Py_UNICODE_MATCH(self, end, substring))
5123 return 1;
5124 } else {
5125 if (Py_UNICODE_MATCH(self, start, substring))
5126 return 1;
5127 }
5128
5129 return 0;
5130}
5131
Martin v. Löwis18e16552006-02-15 17:27:45 +00005132Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005133 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005134 Py_ssize_t start,
5135 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136 int direction)
5137{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005138 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005139
Guido van Rossumd57fd912000-03-10 22:53:23 +00005140 str = PyUnicode_FromObject(str);
5141 if (str == NULL)
5142 return -1;
5143 substr = PyUnicode_FromObject(substr);
5144 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005145 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005146 return -1;
5147 }
Tim Petersced69f82003-09-16 20:30:58 +00005148
Guido van Rossumd57fd912000-03-10 22:53:23 +00005149 result = tailmatch((PyUnicodeObject *)str,
5150 (PyUnicodeObject *)substr,
5151 start, end, direction);
5152 Py_DECREF(str);
5153 Py_DECREF(substr);
5154 return result;
5155}
5156
Guido van Rossumd57fd912000-03-10 22:53:23 +00005157/* Apply fixfct filter to the Unicode object self and return a
5158 reference to the modified object */
5159
Tim Petersced69f82003-09-16 20:30:58 +00005160static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161PyObject *fixup(PyUnicodeObject *self,
5162 int (*fixfct)(PyUnicodeObject *s))
5163{
5164
5165 PyUnicodeObject *u;
5166
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005167 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005168 if (u == NULL)
5169 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005170
5171 Py_UNICODE_COPY(u->str, self->str, self->length);
5172
Tim Peters7a29bd52001-09-12 03:03:31 +00005173 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174 /* fixfct should return TRUE if it modified the buffer. If
5175 FALSE, return a reference to the original buffer instead
5176 (to save space, not time) */
5177 Py_INCREF(self);
5178 Py_DECREF(u);
5179 return (PyObject*) self;
5180 }
5181 return (PyObject*) u;
5182}
5183
Tim Petersced69f82003-09-16 20:30:58 +00005184static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185int fixupper(PyUnicodeObject *self)
5186{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005187 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188 Py_UNICODE *s = self->str;
5189 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005190
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191 while (len-- > 0) {
5192 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005193
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194 ch = Py_UNICODE_TOUPPER(*s);
5195 if (ch != *s) {
5196 status = 1;
5197 *s = ch;
5198 }
5199 s++;
5200 }
5201
5202 return status;
5203}
5204
Tim Petersced69f82003-09-16 20:30:58 +00005205static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206int fixlower(PyUnicodeObject *self)
5207{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005208 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209 Py_UNICODE *s = self->str;
5210 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005211
Guido van Rossumd57fd912000-03-10 22:53:23 +00005212 while (len-- > 0) {
5213 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005214
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215 ch = Py_UNICODE_TOLOWER(*s);
5216 if (ch != *s) {
5217 status = 1;
5218 *s = ch;
5219 }
5220 s++;
5221 }
5222
5223 return status;
5224}
5225
Tim Petersced69f82003-09-16 20:30:58 +00005226static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227int fixswapcase(PyUnicodeObject *self)
5228{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005229 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230 Py_UNICODE *s = self->str;
5231 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005232
Guido van Rossumd57fd912000-03-10 22:53:23 +00005233 while (len-- > 0) {
5234 if (Py_UNICODE_ISUPPER(*s)) {
5235 *s = Py_UNICODE_TOLOWER(*s);
5236 status = 1;
5237 } else if (Py_UNICODE_ISLOWER(*s)) {
5238 *s = Py_UNICODE_TOUPPER(*s);
5239 status = 1;
5240 }
5241 s++;
5242 }
5243
5244 return status;
5245}
5246
Tim Petersced69f82003-09-16 20:30:58 +00005247static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005248int fixcapitalize(PyUnicodeObject *self)
5249{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005250 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005251 Py_UNICODE *s = self->str;
5252 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005253
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005254 if (len == 0)
5255 return 0;
5256 if (Py_UNICODE_ISLOWER(*s)) {
5257 *s = Py_UNICODE_TOUPPER(*s);
5258 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005260 s++;
5261 while (--len > 0) {
5262 if (Py_UNICODE_ISUPPER(*s)) {
5263 *s = Py_UNICODE_TOLOWER(*s);
5264 status = 1;
5265 }
5266 s++;
5267 }
5268 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269}
5270
5271static
5272int fixtitle(PyUnicodeObject *self)
5273{
5274 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5275 register Py_UNICODE *e;
5276 int previous_is_cased;
5277
5278 /* Shortcut for single character strings */
5279 if (PyUnicode_GET_SIZE(self) == 1) {
5280 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5281 if (*p != ch) {
5282 *p = ch;
5283 return 1;
5284 }
5285 else
5286 return 0;
5287 }
Tim Petersced69f82003-09-16 20:30:58 +00005288
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289 e = p + PyUnicode_GET_SIZE(self);
5290 previous_is_cased = 0;
5291 for (; p < e; p++) {
5292 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005293
Guido van Rossumd57fd912000-03-10 22:53:23 +00005294 if (previous_is_cased)
5295 *p = Py_UNICODE_TOLOWER(ch);
5296 else
5297 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005298
5299 if (Py_UNICODE_ISLOWER(ch) ||
5300 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301 Py_UNICODE_ISTITLE(ch))
5302 previous_is_cased = 1;
5303 else
5304 previous_is_cased = 0;
5305 }
5306 return 1;
5307}
5308
Tim Peters8ce9f162004-08-27 01:49:32 +00005309PyObject *
5310PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311{
Tim Peters8ce9f162004-08-27 01:49:32 +00005312 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005313 const Py_UNICODE blank = ' ';
5314 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005315 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005316 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005317 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5318 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005319 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5320 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005321 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005322 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005323 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324
Tim Peters05eba1f2004-08-27 21:32:02 +00005325 fseq = PySequence_Fast(seq, "");
5326 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005327 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005328 }
5329
Tim Peters91879ab2004-08-27 22:35:44 +00005330 /* Grrrr. A codec may be invoked to convert str objects to
5331 * Unicode, and so it's possible to call back into Python code
5332 * during PyUnicode_FromObject(), and so it's possible for a sick
5333 * codec to change the size of fseq (if seq is a list). Therefore
5334 * we have to keep refetching the size -- can't assume seqlen
5335 * is invariant.
5336 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005337 seqlen = PySequence_Fast_GET_SIZE(fseq);
5338 /* If empty sequence, return u"". */
5339 if (seqlen == 0) {
5340 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5341 goto Done;
5342 }
5343 /* If singleton sequence with an exact Unicode, return that. */
5344 if (seqlen == 1) {
5345 item = PySequence_Fast_GET_ITEM(fseq, 0);
5346 if (PyUnicode_CheckExact(item)) {
5347 Py_INCREF(item);
5348 res = (PyUnicodeObject *)item;
5349 goto Done;
5350 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005351 }
5352
Tim Peters05eba1f2004-08-27 21:32:02 +00005353 /* At least two items to join, or one that isn't exact Unicode. */
5354 if (seqlen > 1) {
5355 /* Set up sep and seplen -- they're needed. */
5356 if (separator == NULL) {
5357 sep = &blank;
5358 seplen = 1;
5359 }
5360 else {
5361 internal_separator = PyUnicode_FromObject(separator);
5362 if (internal_separator == NULL)
5363 goto onError;
5364 sep = PyUnicode_AS_UNICODE(internal_separator);
5365 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005366 /* In case PyUnicode_FromObject() mutated seq. */
5367 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005368 }
5369 }
5370
5371 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005372 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005373 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005374 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005375 res_p = PyUnicode_AS_UNICODE(res);
5376 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005377
Tim Peters05eba1f2004-08-27 21:32:02 +00005378 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00005379 Py_ssize_t itemlen;
5380 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005381
5382 item = PySequence_Fast_GET_ITEM(fseq, i);
5383 /* Convert item to Unicode. */
5384 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5385 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00005386 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005387 " %.80s found",
Christian Heimese93237d2007-12-19 02:37:44 +00005388 i, Py_TYPE(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005389 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005390 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005391 item = PyUnicode_FromObject(item);
5392 if (item == NULL)
5393 goto onError;
5394 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005395
Tim Peters91879ab2004-08-27 22:35:44 +00005396 /* In case PyUnicode_FromObject() mutated seq. */
5397 seqlen = PySequence_Fast_GET_SIZE(fseq);
5398
Tim Peters8ce9f162004-08-27 01:49:32 +00005399 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005401 new_res_used = res_used + itemlen;
Georg Brandl90e27d32006-06-10 06:40:50 +00005402 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005403 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005404 if (i < seqlen - 1) {
5405 new_res_used += seplen;
Georg Brandl90e27d32006-06-10 06:40:50 +00005406 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005407 goto Overflow;
5408 }
5409 if (new_res_used > res_alloc) {
5410 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005411 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005412 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00005413 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005414 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005415 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00005416 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005417 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005419 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005420 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005421 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005422
5423 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005424 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005425 res_p += itemlen;
5426 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00005427 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005428 res_p += seplen;
5429 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005431 res_used = new_res_used;
5432 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005433
Tim Peters05eba1f2004-08-27 21:32:02 +00005434 /* Shrink res to match the used area; this probably can't fail,
5435 * but it's cheap to check.
5436 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005437 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005438 goto onError;
5439
5440 Done:
5441 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005442 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 return (PyObject *)res;
5444
Tim Peters8ce9f162004-08-27 01:49:32 +00005445 Overflow:
5446 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005447 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005448 Py_DECREF(item);
5449 /* fall through */
5450
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005452 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005453 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005454 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455 return NULL;
5456}
5457
Tim Petersced69f82003-09-16 20:30:58 +00005458static
5459PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005460 Py_ssize_t left,
5461 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462 Py_UNICODE fill)
5463{
5464 PyUnicodeObject *u;
5465
5466 if (left < 0)
5467 left = 0;
5468 if (right < 0)
5469 right = 0;
5470
Tim Peters7a29bd52001-09-12 03:03:31 +00005471 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472 Py_INCREF(self);
5473 return self;
5474 }
5475
5476 u = _PyUnicode_New(left + self->length + right);
5477 if (u) {
5478 if (left)
5479 Py_UNICODE_FILL(u->str, fill, left);
5480 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5481 if (right)
5482 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5483 }
5484
5485 return u;
5486}
5487
/* Append the slice data[left:right] to `list` as a new unicode object.
   The expanding function must provide a PyObject *str scratch variable,
   the PyObject *list being built, and an onError label, which is jumped
   to when either the slice cannot be created or the append fails.  The
   temporary reference held in `str` is released in every case. */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (str == NULL)                                                    \
        goto onError;                                                   \
    {                                                                   \
        const int append_failed = PyList_Append(list, str) != 0;        \
        Py_DECREF(str);                                                 \
        if (append_failed)                                              \
            goto onError;                                               \
    }
5498
5499static
5500PyObject *split_whitespace(PyUnicodeObject *self,
5501 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005502 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005504 register Py_ssize_t i;
5505 register Py_ssize_t j;
5506 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507 PyObject *str;
5508
5509 for (i = j = 0; i < len; ) {
5510 /* find a token */
5511 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5512 i++;
5513 j = i;
5514 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5515 i++;
5516 if (j < i) {
5517 if (maxcount-- <= 0)
5518 break;
5519 SPLIT_APPEND(self->str, j, i);
5520 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5521 i++;
5522 j = i;
5523 }
5524 }
5525 if (j < len) {
5526 SPLIT_APPEND(self->str, j, len);
5527 }
5528 return list;
5529
5530 onError:
5531 Py_DECREF(list);
5532 return NULL;
5533}
5534
5535PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005536 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005538 register Py_ssize_t i;
5539 register Py_ssize_t j;
5540 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541 PyObject *list;
5542 PyObject *str;
5543 Py_UNICODE *data;
5544
5545 string = PyUnicode_FromObject(string);
5546 if (string == NULL)
5547 return NULL;
5548 data = PyUnicode_AS_UNICODE(string);
5549 len = PyUnicode_GET_SIZE(string);
5550
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551 list = PyList_New(0);
5552 if (!list)
5553 goto onError;
5554
5555 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005556 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005557
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005559 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005560 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561
5562 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005563 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564 if (i < len) {
5565 if (data[i] == '\r' && i + 1 < len &&
5566 data[i+1] == '\n')
5567 i += 2;
5568 else
5569 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005570 if (keepends)
5571 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572 }
Guido van Rossum86662912000-04-11 15:38:46 +00005573 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 j = i;
5575 }
5576 if (j < len) {
5577 SPLIT_APPEND(data, j, len);
5578 }
5579
5580 Py_DECREF(string);
5581 return list;
5582
5583 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005584 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585 Py_DECREF(string);
5586 return NULL;
5587}
5588
Tim Petersced69f82003-09-16 20:30:58 +00005589static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590PyObject *split_char(PyUnicodeObject *self,
5591 PyObject *list,
5592 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005593 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005595 register Py_ssize_t i;
5596 register Py_ssize_t j;
5597 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005598 PyObject *str;
5599
5600 for (i = j = 0; i < len; ) {
5601 if (self->str[i] == ch) {
5602 if (maxcount-- <= 0)
5603 break;
5604 SPLIT_APPEND(self->str, j, i);
5605 i = j = i + 1;
5606 } else
5607 i++;
5608 }
5609 if (j <= len) {
5610 SPLIT_APPEND(self->str, j, len);
5611 }
5612 return list;
5613
5614 onError:
5615 Py_DECREF(list);
5616 return NULL;
5617}
5618
Tim Petersced69f82003-09-16 20:30:58 +00005619static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620PyObject *split_substring(PyUnicodeObject *self,
5621 PyObject *list,
5622 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005623 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005625 register Py_ssize_t i;
5626 register Py_ssize_t j;
5627 Py_ssize_t len = self->length;
5628 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629 PyObject *str;
5630
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005631 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632 if (Py_UNICODE_MATCH(self, i, substring)) {
5633 if (maxcount-- <= 0)
5634 break;
5635 SPLIT_APPEND(self->str, j, i);
5636 i = j = i + sublen;
5637 } else
5638 i++;
5639 }
5640 if (j <= len) {
5641 SPLIT_APPEND(self->str, j, len);
5642 }
5643 return list;
5644
5645 onError:
5646 Py_DECREF(list);
5647 return NULL;
5648}
5649
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005650static
5651PyObject *rsplit_whitespace(PyUnicodeObject *self,
5652 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005653 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005654{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005655 register Py_ssize_t i;
5656 register Py_ssize_t j;
5657 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005658 PyObject *str;
5659
5660 for (i = j = len - 1; i >= 0; ) {
5661 /* find a token */
5662 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5663 i--;
5664 j = i;
5665 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5666 i--;
5667 if (j > i) {
5668 if (maxcount-- <= 0)
5669 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005670 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005671 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5672 i--;
5673 j = i;
5674 }
5675 }
5676 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005677 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005678 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005679 if (PyList_Reverse(list) < 0)
5680 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005681 return list;
5682
5683 onError:
5684 Py_DECREF(list);
5685 return NULL;
5686}
5687
5688static
5689PyObject *rsplit_char(PyUnicodeObject *self,
5690 PyObject *list,
5691 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005692 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005693{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005694 register Py_ssize_t i;
5695 register Py_ssize_t j;
5696 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005697 PyObject *str;
5698
5699 for (i = j = len - 1; i >= 0; ) {
5700 if (self->str[i] == ch) {
5701 if (maxcount-- <= 0)
5702 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005703 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005704 j = i = i - 1;
5705 } else
5706 i--;
5707 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005708 if (j >= -1) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005709 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005710 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005711 if (PyList_Reverse(list) < 0)
5712 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005713 return list;
5714
5715 onError:
5716 Py_DECREF(list);
5717 return NULL;
5718}
5719
5720static
5721PyObject *rsplit_substring(PyUnicodeObject *self,
5722 PyObject *list,
5723 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005724 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005725{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005726 register Py_ssize_t i;
5727 register Py_ssize_t j;
5728 Py_ssize_t len = self->length;
5729 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005730 PyObject *str;
5731
5732 for (i = len - sublen, j = len; i >= 0; ) {
5733 if (Py_UNICODE_MATCH(self, i, substring)) {
5734 if (maxcount-- <= 0)
5735 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005736 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005737 j = i;
5738 i -= sublen;
5739 } else
5740 i--;
5741 }
5742 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005743 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005744 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005745 if (PyList_Reverse(list) < 0)
5746 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005747 return list;
5748
5749 onError:
5750 Py_DECREF(list);
5751 return NULL;
5752}
5753
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754#undef SPLIT_APPEND
5755
5756static
5757PyObject *split(PyUnicodeObject *self,
5758 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005759 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760{
5761 PyObject *list;
5762
5763 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005764 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765
5766 list = PyList_New(0);
5767 if (!list)
5768 return NULL;
5769
5770 if (substring == NULL)
5771 return split_whitespace(self,list,maxcount);
5772
5773 else if (substring->length == 1)
5774 return split_char(self,list,substring->str[0],maxcount);
5775
5776 else if (substring->length == 0) {
5777 Py_DECREF(list);
5778 PyErr_SetString(PyExc_ValueError, "empty separator");
5779 return NULL;
5780 }
5781 else
5782 return split_substring(self,list,substring,maxcount);
5783}
5784
Tim Petersced69f82003-09-16 20:30:58 +00005785static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005786PyObject *rsplit(PyUnicodeObject *self,
5787 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005788 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005789{
5790 PyObject *list;
5791
5792 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005793 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005794
5795 list = PyList_New(0);
5796 if (!list)
5797 return NULL;
5798
5799 if (substring == NULL)
5800 return rsplit_whitespace(self,list,maxcount);
5801
5802 else if (substring->length == 1)
5803 return rsplit_char(self,list,substring->str[0],maxcount);
5804
5805 else if (substring->length == 0) {
5806 Py_DECREF(list);
5807 PyErr_SetString(PyExc_ValueError, "empty separator");
5808 return NULL;
5809 }
5810 else
5811 return rsplit_substring(self,list,substring,maxcount);
5812}
5813
5814static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815PyObject *replace(PyUnicodeObject *self,
5816 PyUnicodeObject *str1,
5817 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005818 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819{
5820 PyUnicodeObject *u;
5821
5822 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005823 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005824
Fredrik Lundh347ee272006-05-24 16:35:18 +00005825 if (str1->length == str2->length) {
5826 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005827 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005828 if (str1->length == 1) {
5829 /* replace characters */
5830 Py_UNICODE u1, u2;
5831 if (!findchar(self->str, self->length, str1->str[0]))
5832 goto nothing;
5833 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5834 if (!u)
5835 return NULL;
5836 Py_UNICODE_COPY(u->str, self->str, self->length);
5837 u1 = str1->str[0];
5838 u2 = str2->str[0];
5839 for (i = 0; i < u->length; i++)
5840 if (u->str[i] == u1) {
5841 if (--maxcount < 0)
5842 break;
5843 u->str[i] = u2;
5844 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005846 i = fastsearch(
5847 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005849 if (i < 0)
5850 goto nothing;
5851 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5852 if (!u)
5853 return NULL;
5854 Py_UNICODE_COPY(u->str, self->str, self->length);
5855 while (i <= self->length - str1->length)
5856 if (Py_UNICODE_MATCH(self, i, str1)) {
5857 if (--maxcount < 0)
5858 break;
5859 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5860 i += str1->length;
5861 } else
5862 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005865
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005866 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005867 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868 Py_UNICODE *p;
5869
5870 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005871 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872 if (n > maxcount)
5873 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005874 if (n == 0)
5875 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005876 /* new_size = self->length + n * (str2->length - str1->length)); */
5877 delta = (str2->length - str1->length);
5878 if (delta == 0) {
5879 new_size = self->length;
5880 } else {
5881 product = n * (str2->length - str1->length);
5882 if ((product / (str2->length - str1->length)) != n) {
5883 PyErr_SetString(PyExc_OverflowError,
5884 "replace string is too long");
5885 return NULL;
5886 }
5887 new_size = self->length + product;
5888 if (new_size < 0) {
5889 PyErr_SetString(PyExc_OverflowError,
5890 "replace string is too long");
5891 return NULL;
5892 }
5893 }
5894 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005895 if (!u)
5896 return NULL;
5897 i = 0;
5898 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005899 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005900 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005901 while (n-- > 0) {
5902 /* look for next match */
5903 j = i;
5904 while (j <= e) {
5905 if (Py_UNICODE_MATCH(self, j, str1))
5906 break;
5907 j++;
5908 }
5909 if (j > i) {
5910 if (j > e)
5911 break;
5912 /* copy unchanged part [i:j] */
5913 Py_UNICODE_COPY(p, self->str+i, j-i);
5914 p += j - i;
5915 }
5916 /* copy substitution string */
5917 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005918 Py_UNICODE_COPY(p, str2->str, str2->length);
5919 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005920 }
5921 i = j + str1->length;
5922 }
5923 if (i < self->length)
5924 /* copy tail [i:] */
5925 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005926 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005927 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005928 while (n > 0) {
5929 Py_UNICODE_COPY(p, str2->str, str2->length);
5930 p += str2->length;
5931 if (--n <= 0)
5932 break;
5933 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005935 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936 }
5937 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005939
5940nothing:
5941 /* nothing to replace; return original string (when possible) */
5942 if (PyUnicode_CheckExact(self)) {
5943 Py_INCREF(self);
5944 return (PyObject *) self;
5945 }
5946 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947}
5948
5949/* --- Unicode Object Methods --------------------------------------------- */
5950
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005951PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952"S.title() -> unicode\n\
5953\n\
5954Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005955characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956
5957static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005958unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960 return fixup(self, fixtitle);
5961}
5962
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005963PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964"S.capitalize() -> unicode\n\
5965\n\
5966Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005967have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968
5969static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005970unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972 return fixup(self, fixcapitalize);
5973}
5974
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split on whitespace,
   capitalize every word in place inside the list, then join the words
   again.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *fixed = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                                fixcapitalize);
        if (fixed == NULL)
            goto done;
        /* drop the old word before its list slot is overwritten */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, fixed);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
6012
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006013/* Argument converter. Coerces to a single unicode character */
6014
6015static int
6016convert_uc(PyObject *obj, void *addr)
6017{
6018 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6019 PyObject *uniobj;
6020 Py_UNICODE *unistr;
6021
6022 uniobj = PyUnicode_FromObject(obj);
6023 if (uniobj == NULL) {
6024 PyErr_SetString(PyExc_TypeError,
6025 "The fill character cannot be converted to Unicode");
6026 return 0;
6027 }
6028 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6029 PyErr_SetString(PyExc_TypeError,
6030 "The fill character must be exactly one character long");
6031 Py_DECREF(uniobj);
6032 return 0;
6033 }
6034 unistr = PyUnicode_AS_UNICODE(uniobj);
6035 *fillcharloc = unistr[0];
6036 Py_DECREF(uniobj);
6037 return 1;
6038}
6039
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006040PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006041"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006043Return S centered in a Unicode string of length width. Padding is\n\
6044done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045
6046static PyObject *
6047unicode_center(PyUnicodeObject *self, PyObject *args)
6048{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006049 Py_ssize_t marg, left;
6050 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006051 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052
Thomas Woutersde017742006-02-16 19:34:37 +00006053 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054 return NULL;
6055
Tim Peters7a29bd52001-09-12 03:03:31 +00006056 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057 Py_INCREF(self);
6058 return (PyObject*) self;
6059 }
6060
6061 marg = width - self->length;
6062 left = marg / 2 + (marg & width & 1);
6063
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006064 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006065}
6066
Marc-André Lemburge5034372000-08-08 08:04:29 +00006067#if 0
6068
6069/* This code should go into some future Unicode collation support
6070 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006071 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006072
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006073/* speedy UTF-16 code point order comparison */
6074/* gleaned from: */
6075/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6076
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006077static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006078{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006079 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006080 0, 0, 0, 0, 0, 0, 0, 0,
6081 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006082 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006083};
6084
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085static int
6086unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6087{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006088 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006089
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090 Py_UNICODE *s1 = str1->str;
6091 Py_UNICODE *s2 = str2->str;
6092
6093 len1 = str1->length;
6094 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006095
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006097 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006098
6099 c1 = *s1++;
6100 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006101
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006102 if (c1 > (1<<11) * 26)
6103 c1 += utf16Fixup[c1>>11];
6104 if (c2 > (1<<11) * 26)
6105 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006106 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006107
6108 if (c1 != c2)
6109 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006110
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006111 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112 }
6113
6114 return (len1 < len2) ? -1 : (len1 != len2);
6115}
6116
Marc-André Lemburge5034372000-08-08 08:04:29 +00006117#else
6118
6119static int
6120unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6121{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006122 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006123
6124 Py_UNICODE *s1 = str1->str;
6125 Py_UNICODE *s2 = str2->str;
6126
6127 len1 = str1->length;
6128 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006129
Marc-André Lemburge5034372000-08-08 08:04:29 +00006130 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006131 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006132
Fredrik Lundh45714e92001-06-26 16:39:36 +00006133 c1 = *s1++;
6134 c2 = *s2++;
6135
6136 if (c1 != c2)
6137 return (c1 < c2) ? -1 : 1;
6138
Marc-André Lemburge5034372000-08-08 08:04:29 +00006139 len1--; len2--;
6140 }
6141
6142 return (len1 < len2) ? -1 : (len1 != len2);
6143}
6144
6145#endif
6146
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147int PyUnicode_Compare(PyObject *left,
6148 PyObject *right)
6149{
6150 PyUnicodeObject *u = NULL, *v = NULL;
6151 int result;
6152
6153 /* Coerce the two arguments */
6154 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6155 if (u == NULL)
6156 goto onError;
6157 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6158 if (v == NULL)
6159 goto onError;
6160
Thomas Wouters7e474022000-07-16 12:04:32 +00006161 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162 if (v == u) {
6163 Py_DECREF(u);
6164 Py_DECREF(v);
6165 return 0;
6166 }
6167
6168 result = unicode_compare(u, v);
6169
6170 Py_DECREF(u);
6171 Py_DECREF(v);
6172 return result;
6173
6174onError:
6175 Py_XDECREF(u);
6176 Py_XDECREF(v);
6177 return -1;
6178}
6179
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006180PyObject *PyUnicode_RichCompare(PyObject *left,
6181 PyObject *right,
6182 int op)
6183{
6184 int result;
6185
6186 result = PyUnicode_Compare(left, right);
6187 if (result == -1 && PyErr_Occurred())
6188 goto onError;
6189
6190 /* Convert the return value to a Boolean */
6191 switch (op) {
6192 case Py_EQ:
6193 result = (result == 0);
6194 break;
6195 case Py_NE:
6196 result = (result != 0);
6197 break;
6198 case Py_LE:
6199 result = (result <= 0);
6200 break;
6201 case Py_GE:
6202 result = (result >= 0);
6203 break;
6204 case Py_LT:
6205 result = (result == -1);
6206 break;
6207 case Py_GT:
6208 result = (result == 1);
6209 break;
6210 }
6211 return PyBool_FromLong(result);
6212
6213 onError:
6214
6215 /* Standard case
6216
6217 Type errors mean that PyUnicode_FromObject() could not convert
6218 one of the arguments (usually the right hand side) to Unicode,
6219 ie. we can't handle the comparison request. However, it is
6220 possible that the other object knows a comparison method, which
6221 is why we return Py_NotImplemented to give the other object a
6222 chance.
6223
6224 */
6225 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6226 PyErr_Clear();
6227 Py_INCREF(Py_NotImplemented);
6228 return Py_NotImplemented;
6229 }
6230 if (op != Py_EQ && op != Py_NE)
6231 return NULL;
6232
6233 /* Equality comparison.
6234
6235 This is a special case: we silence any PyExc_UnicodeDecodeError
6236 and instead turn it into a PyErr_UnicodeWarning.
6237
6238 */
6239 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6240 return NULL;
6241 PyErr_Clear();
6242 if (PyErr_Warn(PyExc_UnicodeWarning,
6243 (op == Py_EQ) ?
6244 "Unicode equal comparison "
6245 "failed to convert both arguments to Unicode - "
6246 "interpreting them as being unequal" :
6247 "Unicode unequal comparison "
6248 "failed to convert both arguments to Unicode - "
6249 "interpreting them as being unequal"
6250 ) < 0)
6251 return NULL;
6252 result = (op == Py_NE);
6253 return PyBool_FromLong(result);
6254}
6255
Guido van Rossum403d68b2000-03-13 15:55:09 +00006256int PyUnicode_Contains(PyObject *container,
6257 PyObject *element)
6258{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006259 PyObject *str, *sub;
6260 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006261
6262 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006263 sub = PyUnicode_FromObject(element);
6264 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006265 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00006266 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00006267 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006268 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006269
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006270 str = PyUnicode_FromObject(container);
6271 if (!str) {
6272 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006273 return -1;
6274 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006275
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006276 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006277
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006278 Py_DECREF(str);
6279 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006280
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006281 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006282}
6283
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284/* Concat to string or Unicode object giving a new Unicode object. */
6285
6286PyObject *PyUnicode_Concat(PyObject *left,
6287 PyObject *right)
6288{
6289 PyUnicodeObject *u = NULL, *v = NULL, *w;
6290
6291 /* Coerce the two arguments */
6292 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6293 if (u == NULL)
6294 goto onError;
6295 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6296 if (v == NULL)
6297 goto onError;
6298
6299 /* Shortcuts */
6300 if (v == unicode_empty) {
6301 Py_DECREF(v);
6302 return (PyObject *)u;
6303 }
6304 if (u == unicode_empty) {
6305 Py_DECREF(u);
6306 return (PyObject *)v;
6307 }
6308
6309 /* Concat the two Unicode strings */
6310 w = _PyUnicode_New(u->length + v->length);
6311 if (w == NULL)
6312 goto onError;
6313 Py_UNICODE_COPY(w->str, u->str, u->length);
6314 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6315
6316 Py_DECREF(u);
6317 Py_DECREF(v);
6318 return (PyObject *)w;
6319
6320onError:
6321 Py_XDECREF(u);
6322 Py_XDECREF(v);
6323 return NULL;
6324}
6325
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006326PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327"S.count(sub[, start[, end]]) -> int\n\
6328\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006329Return the number of non-overlapping occurrences of substring sub in\n\
6330Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006331interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006332
6333static PyObject *
6334unicode_count(PyUnicodeObject *self, PyObject *args)
6335{
6336 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006337 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006338 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339 PyObject *result;
6340
Guido van Rossumb8872e62000-05-09 14:14:27 +00006341 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6342 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343 return NULL;
6344
6345 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006346 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347 if (substring == NULL)
6348 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006349
Fredrik Lundhc8162812006-05-26 19:33:03 +00006350 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006352 result = PyInt_FromSsize_t(
6353 stringlib_count(self->str + start, end - start,
6354 substring->str, substring->length)
6355 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006356
6357 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006358
Guido van Rossumd57fd912000-03-10 22:53:23 +00006359 return result;
6360}
6361
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006362PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006363"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006365Encodes S using the codec registered for encoding. encoding defaults\n\
6366to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006367handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006368a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6369'xmlcharrefreplace' as well as any other name registered with\n\
6370codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371
6372static PyObject *
6373unicode_encode(PyUnicodeObject *self, PyObject *args)
6374{
6375 char *encoding = NULL;
6376 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006377 PyObject *v;
6378
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6380 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006381 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006382 if (v == NULL)
6383 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006384 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6385 PyErr_Format(PyExc_TypeError,
6386 "encoder did not return a string/unicode object "
6387 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006388 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006389 Py_DECREF(v);
6390 return NULL;
6391 }
6392 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006393
6394 onError:
6395 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006396}
6397
6398PyDoc_STRVAR(decode__doc__,
6399"S.decode([encoding[,errors]]) -> string or unicode\n\
6400\n\
6401Decodes S using the codec registered for encoding. encoding defaults\n\
6402to the default encoding. errors may be given to set a different error\n\
6403handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6404a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6405as well as any other name registerd with codecs.register_error that is\n\
6406able to handle UnicodeDecodeErrors.");
6407
6408static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006409unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006410{
6411 char *encoding = NULL;
6412 char *errors = NULL;
6413 PyObject *v;
6414
6415 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6416 return NULL;
6417 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006418 if (v == NULL)
6419 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006420 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6421 PyErr_Format(PyExc_TypeError,
6422 "decoder did not return a string/unicode object "
6423 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006424 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006425 Py_DECREF(v);
6426 return NULL;
6427 }
6428 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006429
6430 onError:
6431 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432}
6433
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006434PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435"S.expandtabs([tabsize]) -> unicode\n\
6436\n\
6437Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006438If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439
6440static PyObject*
6441unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6442{
6443 Py_UNICODE *e;
6444 Py_UNICODE *p;
6445 Py_UNICODE *q;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006446 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447 PyUnicodeObject *u;
6448 int tabsize = 8;
6449
6450 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6451 return NULL;
6452
Thomas Wouters7e474022000-07-16 12:04:32 +00006453 /* First pass: determine size of output string */
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006454 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455 e = self->str + self->length;
6456 for (p = self->str; p < e; p++)
6457 if (*p == '\t') {
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006458 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 j += tabsize - (j % tabsize);
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006460 if (old_j > j) {
Neal Norwitz5c9a81a2007-06-11 02:16:10 +00006461 PyErr_SetString(PyExc_OverflowError,
6462 "new string is too long");
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006463 return NULL;
6464 }
6465 old_j = j;
6466 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467 }
6468 else {
6469 j++;
6470 if (*p == '\n' || *p == '\r') {
6471 i += j;
Neal Norwitz5c9a81a2007-06-11 02:16:10 +00006472 old_j = j = 0;
6473 if (i < 0) {
6474 PyErr_SetString(PyExc_OverflowError,
6475 "new string is too long");
6476 return NULL;
6477 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478 }
6479 }
6480
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006481 if ((i + j) < 0) {
6482 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6483 return NULL;
6484 }
6485
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486 /* Second pass: create output string and fill it */
6487 u = _PyUnicode_New(i + j);
6488 if (!u)
6489 return NULL;
6490
6491 j = 0;
6492 q = u->str;
6493
6494 for (p = self->str; p < e; p++)
6495 if (*p == '\t') {
6496 if (tabsize > 0) {
6497 i = tabsize - (j % tabsize);
6498 j += i;
6499 while (i--)
6500 *q++ = ' ';
6501 }
6502 }
6503 else {
6504 j++;
6505 *q++ = *p;
6506 if (*p == '\n' || *p == '\r')
6507 j = 0;
6508 }
6509
6510 return (PyObject*) u;
6511}
6512
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006513PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514"S.find(sub [,start [,end]]) -> int\n\
6515\n\
6516Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006517such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518arguments start and end are interpreted as in slice notation.\n\
6519\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006520Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521
6522static PyObject *
6523unicode_find(PyUnicodeObject *self, PyObject *args)
6524{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006525 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006526 Py_ssize_t start;
6527 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006528 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529
Facundo Batista57d56692007-11-16 18:04:14 +00006530 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006533 result = stringlib_find_slice(
6534 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6535 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6536 start, end
6537 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538
6539 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006540
6541 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542}
6543
6544static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006545unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546{
6547 if (index < 0 || index >= self->length) {
6548 PyErr_SetString(PyExc_IndexError, "string index out of range");
6549 return NULL;
6550 }
6551
6552 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6553}
6554
6555static long
6556unicode_hash(PyUnicodeObject *self)
6557{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006558 /* Since Unicode objects compare equal to their ASCII string
6559 counterparts, they should use the individual character values
6560 as basis for their hash value. This is needed to assure that
6561 strings and Unicode objects behave in the same way as
6562 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563
Martin v. Löwis18e16552006-02-15 17:27:45 +00006564 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006565 register Py_UNICODE *p;
6566 register long x;
6567
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568 if (self->hash != -1)
6569 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006570 len = PyUnicode_GET_SIZE(self);
6571 p = PyUnicode_AS_UNICODE(self);
6572 x = *p << 7;
6573 while (--len >= 0)
6574 x = (1000003*x) ^ *p++;
6575 x ^= PyUnicode_GET_SIZE(self);
6576 if (x == -1)
6577 x = -2;
6578 self->hash = x;
6579 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580}
6581
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006582PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583"S.index(sub [,start [,end]]) -> int\n\
6584\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006585Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586
6587static PyObject *
6588unicode_index(PyUnicodeObject *self, PyObject *args)
6589{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006590 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006591 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006592 Py_ssize_t start;
6593 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594
Facundo Batista57d56692007-11-16 18:04:14 +00006595 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006598 result = stringlib_find_slice(
6599 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6600 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6601 start, end
6602 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603
6604 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006605
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606 if (result < 0) {
6607 PyErr_SetString(PyExc_ValueError, "substring not found");
6608 return NULL;
6609 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006610
Martin v. Löwis18e16552006-02-15 17:27:45 +00006611 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612}
6613
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006614PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006615"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006617Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006618at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619
6620static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006621unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622{
6623 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6624 register const Py_UNICODE *e;
6625 int cased;
6626
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627 /* Shortcut for single character strings */
6628 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006629 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006631 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006632 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006633 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006634
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635 e = p + PyUnicode_GET_SIZE(self);
6636 cased = 0;
6637 for (; p < e; p++) {
6638 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006639
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006641 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642 else if (!cased && Py_UNICODE_ISLOWER(ch))
6643 cased = 1;
6644 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006645 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646}
6647
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006648PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006649"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006651Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006652at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653
6654static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006655unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656{
6657 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6658 register const Py_UNICODE *e;
6659 int cased;
6660
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661 /* Shortcut for single character strings */
6662 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006663 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006665 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006666 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006667 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006668
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669 e = p + PyUnicode_GET_SIZE(self);
6670 cased = 0;
6671 for (; p < e; p++) {
6672 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006673
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006675 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676 else if (!cased && Py_UNICODE_ISUPPER(ch))
6677 cased = 1;
6678 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006679 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680}
6681
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006682PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006683"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006685Return True if S is a titlecased string and there is at least one\n\
6686character in S, i.e. upper- and titlecase characters may only\n\
6687follow uncased characters and lowercase characters only cased ones.\n\
6688Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689
6690static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006691unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692{
6693 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6694 register const Py_UNICODE *e;
6695 int cased, previous_is_cased;
6696
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697 /* Shortcut for single character strings */
6698 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006699 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6700 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006702 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006703 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006704 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006705
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706 e = p + PyUnicode_GET_SIZE(self);
6707 cased = 0;
6708 previous_is_cased = 0;
6709 for (; p < e; p++) {
6710 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006711
Guido van Rossumd57fd912000-03-10 22:53:23 +00006712 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6713 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006714 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006715 previous_is_cased = 1;
6716 cased = 1;
6717 }
6718 else if (Py_UNICODE_ISLOWER(ch)) {
6719 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006720 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721 previous_is_cased = 1;
6722 cased = 1;
6723 }
6724 else
6725 previous_is_cased = 0;
6726 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006727 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728}
6729
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006730PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006731"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006733Return True if all characters in S are whitespace\n\
6734and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735
6736static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006737unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738{
6739 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6740 register const Py_UNICODE *e;
6741
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742 /* Shortcut for single character strings */
6743 if (PyUnicode_GET_SIZE(self) == 1 &&
6744 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006745 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006747 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006748 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006749 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006750
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751 e = p + PyUnicode_GET_SIZE(self);
6752 for (; p < e; p++) {
6753 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006754 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006756 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757}
6758
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006759PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006760"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006761\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006762Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006763and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006764
6765static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006766unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006767{
6768 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6769 register const Py_UNICODE *e;
6770
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006771 /* Shortcut for single character strings */
6772 if (PyUnicode_GET_SIZE(self) == 1 &&
6773 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006774 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006775
6776 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006777 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006778 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006779
6780 e = p + PyUnicode_GET_SIZE(self);
6781 for (; p < e; p++) {
6782 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006783 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006784 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006785 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006786}
6787
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006788PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006789"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006790\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006791Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006792and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006793
6794static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006795unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006796{
6797 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6798 register const Py_UNICODE *e;
6799
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006800 /* Shortcut for single character strings */
6801 if (PyUnicode_GET_SIZE(self) == 1 &&
6802 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006803 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006804
6805 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006806 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006807 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006808
6809 e = p + PyUnicode_GET_SIZE(self);
6810 for (; p < e; p++) {
6811 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006812 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006813 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006814 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006815}
6816
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006817PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006818"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006820Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006821False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822
6823static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006824unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825{
6826 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6827 register const Py_UNICODE *e;
6828
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829 /* Shortcut for single character strings */
6830 if (PyUnicode_GET_SIZE(self) == 1 &&
6831 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006832 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006834 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006835 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006836 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006837
Guido van Rossumd57fd912000-03-10 22:53:23 +00006838 e = p + PyUnicode_GET_SIZE(self);
6839 for (; p < e; p++) {
6840 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006841 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006842 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006843 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844}
6845
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006846PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006847"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006849Return True if all characters in S are digits\n\
6850and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851
6852static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006853unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854{
6855 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6856 register const Py_UNICODE *e;
6857
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858 /* Shortcut for single character strings */
6859 if (PyUnicode_GET_SIZE(self) == 1 &&
6860 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006861 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006863 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006864 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006865 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006866
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867 e = p + PyUnicode_GET_SIZE(self);
6868 for (; p < e; p++) {
6869 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006870 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006872 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873}
6874
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006875PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006876"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006878Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006879False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006880
6881static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006882unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883{
6884 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6885 register const Py_UNICODE *e;
6886
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887 /* Shortcut for single character strings */
6888 if (PyUnicode_GET_SIZE(self) == 1 &&
6889 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006890 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006892 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006893 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006894 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006895
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896 e = p + PyUnicode_GET_SIZE(self);
6897 for (; p < e; p++) {
6898 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006899 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006901 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902}
6903
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006904PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905"S.join(sequence) -> unicode\n\
6906\n\
6907Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006908sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909
6910static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006911unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006913 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914}
6915
Martin v. Löwis18e16552006-02-15 17:27:45 +00006916static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917unicode_length(PyUnicodeObject *self)
6918{
6919 return self->length;
6920}
6921
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006922PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006923"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924\n\
6925Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006926done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927
6928static PyObject *
6929unicode_ljust(PyUnicodeObject *self, PyObject *args)
6930{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006931 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006932 Py_UNICODE fillchar = ' ';
6933
Martin v. Löwis412fb672006-04-13 06:34:32 +00006934 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935 return NULL;
6936
Tim Peters7a29bd52001-09-12 03:03:31 +00006937 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938 Py_INCREF(self);
6939 return (PyObject*) self;
6940 }
6941
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006942 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943}
6944
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006945PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946"S.lower() -> unicode\n\
6947\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006948Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949
6950static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006951unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953 return fixup(self, fixlower);
6954}
6955
/* Strip modes; they double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: skips the leading "|O:" of the format. */
#define STRIPNAME(i) (stripformat[i]+3)
6964
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006965/* externally visible for str.strip(unicode) */
6966PyObject *
6967_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6968{
6969 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006970 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006971 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006972 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6973 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006974
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006975 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6976
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006977 i = 0;
6978 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006979 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6980 i++;
6981 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006982 }
6983
6984 j = len;
6985 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006986 do {
6987 j--;
6988 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6989 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006990 }
6991
6992 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006993 Py_INCREF(self);
6994 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006995 }
6996 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006997 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006998}
6999
Guido van Rossumd57fd912000-03-10 22:53:23 +00007000
7001static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007002do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007004 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007005 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007006
7007 i = 0;
7008 if (striptype != RIGHTSTRIP) {
7009 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7010 i++;
7011 }
7012 }
7013
7014 j = len;
7015 if (striptype != LEFTSTRIP) {
7016 do {
7017 j--;
7018 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7019 j++;
7020 }
7021
7022 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7023 Py_INCREF(self);
7024 return (PyObject*)self;
7025 }
7026 else
7027 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028}
7029
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007030
7031static PyObject *
7032do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7033{
7034 PyObject *sep = NULL;
7035
7036 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7037 return NULL;
7038
7039 if (sep != NULL && sep != Py_None) {
7040 if (PyUnicode_Check(sep))
7041 return _PyUnicode_XStrip(self, striptype, sep);
7042 else if (PyString_Check(sep)) {
7043 PyObject *res;
7044 sep = PyUnicode_FromObject(sep);
7045 if (sep==NULL)
7046 return NULL;
7047 res = _PyUnicode_XStrip(self, striptype, sep);
7048 Py_DECREF(sep);
7049 return res;
7050 }
7051 else {
7052 PyErr_Format(PyExc_TypeError,
7053 "%s arg must be None, unicode or str",
7054 STRIPNAME(striptype));
7055 return NULL;
7056 }
7057 }
7058
7059 return do_strip(self, striptype);
7060}
7061
7062
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007063PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007064"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007065\n\
7066Return a copy of the string S with leading and trailing\n\
7067whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007068If chars is given and not None, remove characters in chars instead.\n\
7069If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007070
7071static PyObject *
7072unicode_strip(PyUnicodeObject *self, PyObject *args)
7073{
7074 if (PyTuple_GET_SIZE(args) == 0)
7075 return do_strip(self, BOTHSTRIP); /* Common case */
7076 else
7077 return do_argstrip(self, BOTHSTRIP, args);
7078}
7079
7080
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007081PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007082"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007083\n\
7084Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007085If chars is given and not None, remove characters in chars instead.\n\
7086If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007087
7088static PyObject *
7089unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7090{
7091 if (PyTuple_GET_SIZE(args) == 0)
7092 return do_strip(self, LEFTSTRIP); /* Common case */
7093 else
7094 return do_argstrip(self, LEFTSTRIP, args);
7095}
7096
7097
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007098PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007099"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007100\n\
7101Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007102If chars is given and not None, remove characters in chars instead.\n\
7103If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007104
7105static PyObject *
7106unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7107{
7108 if (PyTuple_GET_SIZE(args) == 0)
7109 return do_strip(self, RIGHTSTRIP); /* Common case */
7110 else
7111 return do_argstrip(self, RIGHTSTRIP, args);
7112}
7113
7114
Guido van Rossumd57fd912000-03-10 22:53:23 +00007115static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007116unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117{
7118 PyUnicodeObject *u;
7119 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007120 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007121 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007122
7123 if (len < 0)
7124 len = 0;
7125
Tim Peters7a29bd52001-09-12 03:03:31 +00007126 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007127 /* no repeat, return original string */
7128 Py_INCREF(str);
7129 return (PyObject*) str;
7130 }
Tim Peters8f422462000-09-09 06:13:41 +00007131
7132 /* ensure # of chars needed doesn't overflow int and # of bytes
7133 * needed doesn't overflow size_t
7134 */
7135 nchars = len * str->length;
7136 if (len && nchars / len != str->length) {
7137 PyErr_SetString(PyExc_OverflowError,
7138 "repeated string is too long");
7139 return NULL;
7140 }
7141 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7142 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7143 PyErr_SetString(PyExc_OverflowError,
7144 "repeated string is too long");
7145 return NULL;
7146 }
7147 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007148 if (!u)
7149 return NULL;
7150
7151 p = u->str;
7152
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007153 if (str->length == 1 && len > 0) {
7154 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007155 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00007156 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007157 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007158 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007159 done = str->length;
7160 }
7161 while (done < nchars) {
7162 int n = (done <= nchars-done) ? done : nchars-done;
7163 Py_UNICODE_COPY(p+done, p, n);
7164 done += n;
7165 }
7166 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167
7168 return (PyObject*) u;
7169}
7170
7171PyObject *PyUnicode_Replace(PyObject *obj,
7172 PyObject *subobj,
7173 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007174 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007175{
7176 PyObject *self;
7177 PyObject *str1;
7178 PyObject *str2;
7179 PyObject *result;
7180
7181 self = PyUnicode_FromObject(obj);
7182 if (self == NULL)
7183 return NULL;
7184 str1 = PyUnicode_FromObject(subobj);
7185 if (str1 == NULL) {
7186 Py_DECREF(self);
7187 return NULL;
7188 }
7189 str2 = PyUnicode_FromObject(replobj);
7190 if (str2 == NULL) {
7191 Py_DECREF(self);
7192 Py_DECREF(str1);
7193 return NULL;
7194 }
Tim Petersced69f82003-09-16 20:30:58 +00007195 result = replace((PyUnicodeObject *)self,
7196 (PyUnicodeObject *)str1,
7197 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007198 maxcount);
7199 Py_DECREF(self);
7200 Py_DECREF(str1);
7201 Py_DECREF(str2);
7202 return result;
7203}
7204
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007205PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206"S.replace (old, new[, maxsplit]) -> unicode\n\
7207\n\
7208Return a copy of S with all occurrences of substring\n\
7209old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007210given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211
7212static PyObject*
7213unicode_replace(PyUnicodeObject *self, PyObject *args)
7214{
7215 PyUnicodeObject *str1;
7216 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007217 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007218 PyObject *result;
7219
Martin v. Löwis18e16552006-02-15 17:27:45 +00007220 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007221 return NULL;
7222 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7223 if (str1 == NULL)
7224 return NULL;
7225 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007226 if (str2 == NULL) {
7227 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007228 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007229 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230
7231 result = replace(self, str1, str2, maxcount);
7232
7233 Py_DECREF(str1);
7234 Py_DECREF(str2);
7235 return result;
7236}
7237
7238static
7239PyObject *unicode_repr(PyObject *unicode)
7240{
7241 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7242 PyUnicode_GET_SIZE(unicode),
7243 1);
7244}
7245
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007246PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007247"S.rfind(sub [,start [,end]]) -> int\n\
7248\n\
7249Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00007250such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251arguments start and end are interpreted as in slice notation.\n\
7252\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007253Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254
7255static PyObject *
7256unicode_rfind(PyUnicodeObject *self, PyObject *args)
7257{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007258 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007259 Py_ssize_t start;
7260 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007261 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007262
Facundo Batista57d56692007-11-16 18:04:14 +00007263 if (!_ParseTupleFinds(args, &substring, &start, &end))
7264 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007265
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007266 result = stringlib_rfind_slice(
7267 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7268 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7269 start, end
7270 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007271
7272 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007273
7274 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275}
7276
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007277PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007278"S.rindex(sub [,start [,end]]) -> int\n\
7279\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007280Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007281
7282static PyObject *
7283unicode_rindex(PyUnicodeObject *self, PyObject *args)
7284{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007285 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007286 Py_ssize_t start;
7287 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007288 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007289
Facundo Batista57d56692007-11-16 18:04:14 +00007290 if (!_ParseTupleFinds(args, &substring, &start, &end))
7291 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007293 result = stringlib_rfind_slice(
7294 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7295 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7296 start, end
7297 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007298
7299 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007300
Guido van Rossumd57fd912000-03-10 22:53:23 +00007301 if (result < 0) {
7302 PyErr_SetString(PyExc_ValueError, "substring not found");
7303 return NULL;
7304 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007305 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306}
7307
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007308PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007309"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310\n\
7311Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007312done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313
7314static PyObject *
7315unicode_rjust(PyUnicodeObject *self, PyObject *args)
7316{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007317 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007318 Py_UNICODE fillchar = ' ';
7319
Martin v. Löwis412fb672006-04-13 06:34:32 +00007320 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007321 return NULL;
7322
Tim Peters7a29bd52001-09-12 03:03:31 +00007323 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007324 Py_INCREF(self);
7325 return (PyObject*) self;
7326 }
7327
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007328 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007329}
7330
Guido van Rossumd57fd912000-03-10 22:53:23 +00007331static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007332unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007333{
7334 /* standard clamping */
7335 if (start < 0)
7336 start = 0;
7337 if (end < 0)
7338 end = 0;
7339 if (end > self->length)
7340 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007341 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342 /* full slice, return original string */
7343 Py_INCREF(self);
7344 return (PyObject*) self;
7345 }
7346 if (start > end)
7347 start = end;
7348 /* copy slice */
7349 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7350 end - start);
7351}
7352
7353PyObject *PyUnicode_Split(PyObject *s,
7354 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007355 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356{
7357 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007358
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359 s = PyUnicode_FromObject(s);
7360 if (s == NULL)
7361 return NULL;
7362 if (sep != NULL) {
7363 sep = PyUnicode_FromObject(sep);
7364 if (sep == NULL) {
7365 Py_DECREF(s);
7366 return NULL;
7367 }
7368 }
7369
7370 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7371
7372 Py_DECREF(s);
7373 Py_XDECREF(sep);
7374 return result;
7375}
7376
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007377PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007378"S.split([sep [,maxsplit]]) -> list of strings\n\
7379\n\
7380Return a list of the words in S, using sep as the\n\
7381delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007382splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007383any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007384
7385static PyObject*
7386unicode_split(PyUnicodeObject *self, PyObject *args)
7387{
7388 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007389 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007390
Martin v. Löwis18e16552006-02-15 17:27:45 +00007391 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007392 return NULL;
7393
7394 if (substring == Py_None)
7395 return split(self, NULL, maxcount);
7396 else if (PyUnicode_Check(substring))
7397 return split(self, (PyUnicodeObject *)substring, maxcount);
7398 else
7399 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7400}
7401
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007402PyObject *
7403PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7404{
7405 PyObject* str_obj;
7406 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007407 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007408
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007409 str_obj = PyUnicode_FromObject(str_in);
7410 if (!str_obj)
7411 return NULL;
7412 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007413 if (!sep_obj) {
7414 Py_DECREF(str_obj);
7415 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007416 }
7417
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007418 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007419 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7420 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7421 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007422
Fredrik Lundhb9479482006-05-26 17:22:38 +00007423 Py_DECREF(sep_obj);
7424 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007425
7426 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007427}
7428
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007429
7430PyObject *
7431PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7432{
7433 PyObject* str_obj;
7434 PyObject* sep_obj;
7435 PyObject* out;
7436
7437 str_obj = PyUnicode_FromObject(str_in);
7438 if (!str_obj)
7439 return NULL;
7440 sep_obj = PyUnicode_FromObject(sep_in);
7441 if (!sep_obj) {
7442 Py_DECREF(str_obj);
7443 return NULL;
7444 }
7445
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007446 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007447 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7448 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7449 );
7450
7451 Py_DECREF(sep_obj);
7452 Py_DECREF(str_obj);
7453
7454 return out;
7455}
7456
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007457PyDoc_STRVAR(partition__doc__,
7458"S.partition(sep) -> (head, sep, tail)\n\
7459\n\
7460Searches for the separator sep in S, and returns the part before it,\n\
7461the separator itself, and the part after it. If the separator is not\n\
7462found, returns S and two empty strings.");
7463
7464static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007465unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007466{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007467 return PyUnicode_Partition((PyObject *)self, separator);
7468}
7469
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007470PyDoc_STRVAR(rpartition__doc__,
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007471"S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007472\n\
7473Searches for the separator sep in S, starting at the end of S, and returns\n\
7474the part before it, the separator itself, and the part after it. If the\n\
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007475separator is not found, returns two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007476
7477static PyObject*
7478unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7479{
7480 return PyUnicode_RPartition((PyObject *)self, separator);
7481}
7482
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007483PyObject *PyUnicode_RSplit(PyObject *s,
7484 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007485 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007486{
7487 PyObject *result;
7488
7489 s = PyUnicode_FromObject(s);
7490 if (s == NULL)
7491 return NULL;
7492 if (sep != NULL) {
7493 sep = PyUnicode_FromObject(sep);
7494 if (sep == NULL) {
7495 Py_DECREF(s);
7496 return NULL;
7497 }
7498 }
7499
7500 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7501
7502 Py_DECREF(s);
7503 Py_XDECREF(sep);
7504 return result;
7505}
7506
7507PyDoc_STRVAR(rsplit__doc__,
7508"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7509\n\
7510Return a list of the words in S, using sep as the\n\
7511delimiter string, starting at the end of the string and\n\
7512working to the front. If maxsplit is given, at most maxsplit\n\
7513splits are done. If sep is not specified, any whitespace string\n\
7514is a separator.");
7515
7516static PyObject*
7517unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7518{
7519 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007520 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007521
Martin v. Löwis18e16552006-02-15 17:27:45 +00007522 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007523 return NULL;
7524
7525 if (substring == Py_None)
7526 return rsplit(self, NULL, maxcount);
7527 else if (PyUnicode_Check(substring))
7528 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7529 else
7530 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7531}
7532
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007533PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007534"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007535\n\
7536Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007537Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007538is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007539
7540static PyObject*
7541unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7542{
Guido van Rossum86662912000-04-11 15:38:46 +00007543 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007544
Guido van Rossum86662912000-04-11 15:38:46 +00007545 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546 return NULL;
7547
Guido van Rossum86662912000-04-11 15:38:46 +00007548 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549}
7550
7551static
7552PyObject *unicode_str(PyUnicodeObject *self)
7553{
Fred Drakee4315f52000-05-09 19:53:39 +00007554 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007555}
7556
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007557PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007558"S.swapcase() -> unicode\n\
7559\n\
7560Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007561and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007562
7563static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007564unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007566 return fixup(self, fixswapcase);
7567}
7568
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007569PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007570"S.translate(table) -> unicode\n\
7571\n\
7572Return a copy of the string S, where all characters have been mapped\n\
7573through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007574Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7575Unmapped characters are left untouched. Characters mapped to None\n\
7576are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577
7578static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007579unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580{
Tim Petersced69f82003-09-16 20:30:58 +00007581 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007583 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584 "ignore");
7585}
7586
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007587PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007588"S.upper() -> unicode\n\
7589\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007590Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591
7592static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007593unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007594{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595 return fixup(self, fixupper);
7596}
7597
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007598PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599"S.zfill(width) -> unicode\n\
7600\n\
7601Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007602of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007603
7604static PyObject *
7605unicode_zfill(PyUnicodeObject *self, PyObject *args)
7606{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007607 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608 PyUnicodeObject *u;
7609
Martin v. Löwis18e16552006-02-15 17:27:45 +00007610 Py_ssize_t width;
7611 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612 return NULL;
7613
7614 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007615 if (PyUnicode_CheckExact(self)) {
7616 Py_INCREF(self);
7617 return (PyObject*) self;
7618 }
7619 else
7620 return PyUnicode_FromUnicode(
7621 PyUnicode_AS_UNICODE(self),
7622 PyUnicode_GET_SIZE(self)
7623 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007624 }
7625
7626 fill = width - self->length;
7627
7628 u = pad(self, fill, 0, '0');
7629
Walter Dörwald068325e2002-04-15 13:36:47 +00007630 if (u == NULL)
7631 return NULL;
7632
Guido van Rossumd57fd912000-03-10 22:53:23 +00007633 if (u->str[fill] == '+' || u->str[fill] == '-') {
7634 /* move sign to beginning of string */
7635 u->str[0] = u->str[fill];
7636 u->str[fill] = '0';
7637 }
7638
7639 return (PyObject*) u;
7640}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007641
#if 0
/* Compiled out: reports the value of unicode_freelist_size as a Python
   int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *size = PyInt_FromLong(unicode_freelist_size);

    return size;
}
#endif
7649
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007650PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007651"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007653Return True if S starts with the specified prefix, False otherwise.\n\
7654With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007655With optional end, stop comparing S at that position.\n\
7656prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657
7658static PyObject *
7659unicode_startswith(PyUnicodeObject *self,
7660 PyObject *args)
7661{
Georg Brandl24250812006-06-09 18:45:48 +00007662 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007663 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007664 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007665 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007666 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007667
Georg Brandl24250812006-06-09 18:45:48 +00007668 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007669 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007670 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007671 if (PyTuple_Check(subobj)) {
7672 Py_ssize_t i;
7673 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7674 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7675 PyTuple_GET_ITEM(subobj, i));
7676 if (substring == NULL)
7677 return NULL;
7678 result = tailmatch(self, substring, start, end, -1);
7679 Py_DECREF(substring);
7680 if (result) {
7681 Py_RETURN_TRUE;
7682 }
7683 }
7684 /* nothing matched */
7685 Py_RETURN_FALSE;
7686 }
7687 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007688 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007689 return NULL;
7690 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007691 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007692 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693}
7694
7695
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007696PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007697"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007698\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007699Return True if S ends with the specified suffix, False otherwise.\n\
7700With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007701With optional end, stop comparing S at that position.\n\
7702suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007703
7704static PyObject *
7705unicode_endswith(PyUnicodeObject *self,
7706 PyObject *args)
7707{
Georg Brandl24250812006-06-09 18:45:48 +00007708 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007709 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007710 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007711 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007712 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007713
Georg Brandl24250812006-06-09 18:45:48 +00007714 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7715 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007717 if (PyTuple_Check(subobj)) {
7718 Py_ssize_t i;
7719 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7720 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7721 PyTuple_GET_ITEM(subobj, i));
7722 if (substring == NULL)
7723 return NULL;
7724 result = tailmatch(self, substring, start, end, +1);
7725 Py_DECREF(substring);
7726 if (result) {
7727 Py_RETURN_TRUE;
7728 }
7729 }
7730 Py_RETURN_FALSE;
7731 }
7732 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007734 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007735
Georg Brandl24250812006-06-09 18:45:48 +00007736 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007737 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007738 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007739}
7740
7741
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007742
7743static PyObject *
7744unicode_getnewargs(PyUnicodeObject *v)
7745{
7746 return Py_BuildValue("(u#)", v->str, v->length);
7747}
7748
7749
Guido van Rossumd57fd912000-03-10 22:53:23 +00007750static PyMethodDef unicode_methods[] = {
7751
7752 /* Order is according to common usage: often used methods should
7753 appear first, since lookup is done sequentially. */
7754
Georg Brandlecdc0a92006-03-30 12:19:07 +00007755 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007756 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7757 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007758 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007759 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7760 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7761 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7762 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7763 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7764 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7765 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007766 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007767 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7768 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7769 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007770 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007771 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007772/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7773 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7774 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7775 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007776 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007777 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007778 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007779 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007780 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7781 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7782 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7783 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7784 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7785 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7786 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7787 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7788 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7789 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7790 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7791 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7792 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7793 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007794 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007795#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007796 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007797#endif
7798
7799#if 0
7800 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007801 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802#endif
7803
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007804 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007805 {NULL, NULL}
7806};
7807
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007808static PyObject *
7809unicode_mod(PyObject *v, PyObject *w)
7810{
7811 if (!PyUnicode_Check(v)) {
7812 Py_INCREF(Py_NotImplemented);
7813 return Py_NotImplemented;
7814 }
7815 return PyUnicode_Format(v, w);
7816}
7817
7818static PyNumberMethods unicode_as_number = {
7819 0, /*nb_add*/
7820 0, /*nb_subtract*/
7821 0, /*nb_multiply*/
7822 0, /*nb_divide*/
7823 unicode_mod, /*nb_remainder*/
7824};
7825
Guido van Rossumd57fd912000-03-10 22:53:23 +00007826static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007827 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00007828 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007829 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7830 (ssizeargfunc) unicode_getitem, /* sq_item */
7831 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007832 0, /* sq_ass_item */
7833 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00007834 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007835};
7836
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007837static PyObject*
7838unicode_subscript(PyUnicodeObject* self, PyObject* item)
7839{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007840 if (PyIndex_Check(item)) {
7841 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007842 if (i == -1 && PyErr_Occurred())
7843 return NULL;
7844 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007845 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007846 return unicode_getitem(self, i);
7847 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007848 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007849 Py_UNICODE* source_buf;
7850 Py_UNICODE* result_buf;
7851 PyObject* result;
7852
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007853 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007854 &start, &stop, &step, &slicelength) < 0) {
7855 return NULL;
7856 }
7857
7858 if (slicelength <= 0) {
7859 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00007860 } else if (start == 0 && step == 1 && slicelength == self->length &&
7861 PyUnicode_CheckExact(self)) {
7862 Py_INCREF(self);
7863 return (PyObject *)self;
7864 } else if (step == 1) {
7865 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007866 } else {
7867 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Anthony Baxtera6286212006-04-11 07:42:36 +00007868 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7869 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007870
7871 if (result_buf == NULL)
7872 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007873
7874 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7875 result_buf[i] = source_buf[cur];
7876 }
Tim Petersced69f82003-09-16 20:30:58 +00007877
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007878 result = PyUnicode_FromUnicode(result_buf, slicelength);
7879 PyMem_FREE(result_buf);
7880 return result;
7881 }
7882 } else {
7883 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7884 return NULL;
7885 }
7886}
7887
7888static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007889 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007890 (binaryfunc)unicode_subscript, /* mp_subscript */
7891 (objobjargproc)0, /* mp_ass_subscript */
7892};
7893
Martin v. Löwis18e16552006-02-15 17:27:45 +00007894static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007895unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007896 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007897 const void **ptr)
7898{
7899 if (index != 0) {
7900 PyErr_SetString(PyExc_SystemError,
7901 "accessing non-existent unicode segment");
7902 return -1;
7903 }
7904 *ptr = (void *) self->str;
7905 return PyUnicode_GET_DATA_SIZE(self);
7906}
7907
Martin v. Löwis18e16552006-02-15 17:27:45 +00007908static Py_ssize_t
7909unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007910 const void **ptr)
7911{
7912 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007913 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007914 return -1;
7915}
7916
7917static int
7918unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007919 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007920{
7921 if (lenp)
7922 *lenp = PyUnicode_GET_DATA_SIZE(self);
7923 return 1;
7924}
7925
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007926static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007927unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007928 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007929 const void **ptr)
7930{
7931 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007932
Guido van Rossumd57fd912000-03-10 22:53:23 +00007933 if (index != 0) {
7934 PyErr_SetString(PyExc_SystemError,
7935 "accessing non-existent unicode segment");
7936 return -1;
7937 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007938 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007939 if (str == NULL)
7940 return -1;
7941 *ptr = (void *) PyString_AS_STRING(str);
7942 return PyString_GET_SIZE(str);
7943}
7944
7945/* Helpers for PyUnicode_Format() */
7946
7947static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007948getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007949{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007950 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007951 if (argidx < arglen) {
7952 (*p_argidx)++;
7953 if (arglen < 0)
7954 return args;
7955 else
7956 return PyTuple_GetItem(args, argidx);
7957 }
7958 PyErr_SetString(PyExc_TypeError,
7959 "not enough arguments for format string");
7960 return NULL;
7961}
7962
/* Conversion flag bits collected while parsing a format specifier. */
#define F_LJUST (1 << 0)    /* '-' flag */
#define F_SIGN  (1 << 1)    /* '+' flag */
#define F_BLANK (1 << 2)    /* ' ' flag */
#define F_ALT   (1 << 3)    /* '#' flag */
#define F_ZERO  (1 << 4)    /* '0' flag */
7968
Martin v. Löwis18e16552006-02-15 17:27:45 +00007969static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007970strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007971{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007972 register Py_ssize_t i;
7973 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007974 for (i = len - 1; i >= 0; i--)
7975 buffer[i] = (Py_UNICODE) charbuffer[i];
7976
Guido van Rossumd57fd912000-03-10 22:53:23 +00007977 return len;
7978}
7979
Neal Norwitzfc76d632006-01-10 06:03:13 +00007980static int
7981doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7982{
Tim Peters15231542006-02-16 01:08:01 +00007983 Py_ssize_t result;
7984
Neal Norwitzfc76d632006-01-10 06:03:13 +00007985 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007986 result = strtounicode(buffer, (char *)buffer);
7987 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007988}
7989
7990static int
7991longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7992{
Tim Peters15231542006-02-16 01:08:01 +00007993 Py_ssize_t result;
7994
Neal Norwitzfc76d632006-01-10 06:03:13 +00007995 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007996 result = strtounicode(buffer, (char *)buffer);
7997 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007998}
7999
Guido van Rossum078151d2002-08-11 04:24:12 +00008000/* XXX To save some code duplication, formatfloat/long/int could have been
8001 shared with stringobject.c, converting from 8-bit to Unicode after the
8002 formatting is done. */
8003
Guido van Rossumd57fd912000-03-10 22:53:23 +00008004static int
8005formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008006 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008007 int flags,
8008 int prec,
8009 int type,
8010 PyObject *v)
8011{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008012 /* fmt = '%#.' + `prec` + `type`
8013 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014 char fmt[20];
8015 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008016
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017 x = PyFloat_AsDouble(v);
8018 if (x == -1.0 && PyErr_Occurred())
8019 return -1;
8020 if (prec < 0)
8021 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8023 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008024 /* Worst case length calc to ensure no buffer overrun:
8025
8026 'g' formats:
8027 fmt = %#.<prec>g
8028 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8029 for any double rep.)
8030 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8031
8032 'f' formats:
8033 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8034 len = 1 + 50 + 1 + prec = 52 + prec
8035
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008036 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008037 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008038
8039 */
Georg Brandl7c3b50d2007-07-12 08:38:00 +00008040 if (((type == 'g' || type == 'G') &&
8041 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008042 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008043 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008044 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008045 return -1;
8046 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008047 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8048 (flags&F_ALT) ? "#" : "",
8049 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008050 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008051}
8052
Tim Peters38fd5b62000-09-21 05:43:11 +00008053static PyObject*
8054formatlong(PyObject *val, int flags, int prec, int type)
8055{
8056 char *buf;
8057 int i, len;
8058 PyObject *str; /* temporary string object. */
8059 PyUnicodeObject *result;
8060
8061 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8062 if (!str)
8063 return NULL;
8064 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008065 if (!result) {
8066 Py_DECREF(str);
8067 return NULL;
8068 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008069 for (i = 0; i < len; i++)
8070 result->str[i] = buf[i];
8071 result->str[len] = 0;
8072 Py_DECREF(str);
8073 return (PyObject*)result;
8074}
8075
Guido van Rossumd57fd912000-03-10 22:53:23 +00008076static int
8077formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008078 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008079 int flags,
8080 int prec,
8081 int type,
8082 PyObject *v)
8083{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008084 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008085 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8086 * + 1 + 1
8087 * = 24
8088 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008089 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008090 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091 long x;
8092
8093 x = PyInt_AsLong(v);
8094 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008095 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008096 if (x < 0 && type == 'u') {
8097 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008098 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008099 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8100 sign = "-";
8101 else
8102 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008103 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008104 prec = 1;
8105
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008106 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8107 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008108 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008109 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008110 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008111 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008112 return -1;
8113 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008114
8115 if ((flags & F_ALT) &&
8116 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008117 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008118 * of issues that cause pain:
8119 * - when 0 is being converted, the C standard leaves off
8120 * the '0x' or '0X', which is inconsistent with other
8121 * %#x/%#X conversions and inconsistent with Python's
8122 * hex() function
8123 * - there are platforms that violate the standard and
8124 * convert 0 with the '0x' or '0X'
8125 * (Metrowerks, Compaq Tru64)
8126 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008127 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008128 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008129 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008130 * We can achieve the desired consistency by inserting our
8131 * own '0x' or '0X' prefix, and substituting %x/%X in place
8132 * of %#x/%#X.
8133 *
8134 * Note that this is the same approach as used in
8135 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008136 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008137 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8138 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008139 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008140 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008141 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8142 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008143 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008144 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008145 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008146 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008147 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008148 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008149}
8150
8151static int
8152formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008153 size_t buflen,
8154 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008155{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008156 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008157 if (PyUnicode_Check(v)) {
8158 if (PyUnicode_GET_SIZE(v) != 1)
8159 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008160 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008161 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008162
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008163 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008164 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008165 goto onError;
8166 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8167 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008168
8169 else {
8170 /* Integer input truncated to a character */
8171 long x;
8172 x = PyInt_AsLong(v);
8173 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008174 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008175#ifdef Py_UNICODE_WIDE
8176 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008177 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008178 "%c arg not in range(0x110000) "
8179 "(wide Python build)");
8180 return -1;
8181 }
8182#else
8183 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008184 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008185 "%c arg not in range(0x10000) "
8186 "(narrow Python build)");
8187 return -1;
8188 }
8189#endif
8190 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008191 }
8192 buf[1] = '\0';
8193 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008194
8195 onError:
8196 PyErr_SetString(PyExc_TypeError,
8197 "%c requires int or char");
8198 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008199}
8200
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)120)
8210
Guido van Rossumd57fd912000-03-10 22:53:23 +00008211PyObject *PyUnicode_Format(PyObject *format,
8212 PyObject *args)
8213{
8214 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008215 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216 int args_owned = 0;
8217 PyUnicodeObject *result = NULL;
8218 PyObject *dict = NULL;
8219 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008220
Guido van Rossumd57fd912000-03-10 22:53:23 +00008221 if (format == NULL || args == NULL) {
8222 PyErr_BadInternalCall();
8223 return NULL;
8224 }
8225 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008226 if (uformat == NULL)
8227 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008228 fmt = PyUnicode_AS_UNICODE(uformat);
8229 fmtcnt = PyUnicode_GET_SIZE(uformat);
8230
8231 reslen = rescnt = fmtcnt + 100;
8232 result = _PyUnicode_New(reslen);
8233 if (result == NULL)
8234 goto onError;
8235 res = PyUnicode_AS_UNICODE(result);
8236
8237 if (PyTuple_Check(args)) {
8238 arglen = PyTuple_Size(args);
8239 argidx = 0;
8240 }
8241 else {
8242 arglen = -1;
8243 argidx = -2;
8244 }
Christian Heimese93237d2007-12-19 02:37:44 +00008245 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008246 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008247 dict = args;
8248
8249 while (--fmtcnt >= 0) {
8250 if (*fmt != '%') {
8251 if (--rescnt < 0) {
8252 rescnt = fmtcnt + 100;
8253 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008254 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008255 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008256 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8257 --rescnt;
8258 }
8259 *res++ = *fmt++;
8260 }
8261 else {
8262 /* Got a format specifier */
8263 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008264 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008265 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008266 Py_UNICODE c = '\0';
8267 Py_UNICODE fill;
8268 PyObject *v = NULL;
8269 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008270 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008272 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008273 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274
8275 fmt++;
8276 if (*fmt == '(') {
8277 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008278 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008279 PyObject *key;
8280 int pcount = 1;
8281
8282 if (dict == NULL) {
8283 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008284 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285 goto onError;
8286 }
8287 ++fmt;
8288 --fmtcnt;
8289 keystart = fmt;
8290 /* Skip over balanced parentheses */
8291 while (pcount > 0 && --fmtcnt >= 0) {
8292 if (*fmt == ')')
8293 --pcount;
8294 else if (*fmt == '(')
8295 ++pcount;
8296 fmt++;
8297 }
8298 keylen = fmt - keystart - 1;
8299 if (fmtcnt < 0 || pcount > 0) {
8300 PyErr_SetString(PyExc_ValueError,
8301 "incomplete format key");
8302 goto onError;
8303 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008304#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008305 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306 then looked up since Python uses strings to hold
8307 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008308 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008309 key = PyUnicode_EncodeUTF8(keystart,
8310 keylen,
8311 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008312#else
8313 key = PyUnicode_FromUnicode(keystart, keylen);
8314#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008315 if (key == NULL)
8316 goto onError;
8317 if (args_owned) {
8318 Py_DECREF(args);
8319 args_owned = 0;
8320 }
8321 args = PyObject_GetItem(dict, key);
8322 Py_DECREF(key);
8323 if (args == NULL) {
8324 goto onError;
8325 }
8326 args_owned = 1;
8327 arglen = -1;
8328 argidx = -2;
8329 }
8330 while (--fmtcnt >= 0) {
8331 switch (c = *fmt++) {
8332 case '-': flags |= F_LJUST; continue;
8333 case '+': flags |= F_SIGN; continue;
8334 case ' ': flags |= F_BLANK; continue;
8335 case '#': flags |= F_ALT; continue;
8336 case '0': flags |= F_ZERO; continue;
8337 }
8338 break;
8339 }
8340 if (c == '*') {
8341 v = getnextarg(args, arglen, &argidx);
8342 if (v == NULL)
8343 goto onError;
8344 if (!PyInt_Check(v)) {
8345 PyErr_SetString(PyExc_TypeError,
8346 "* wants int");
8347 goto onError;
8348 }
8349 width = PyInt_AsLong(v);
8350 if (width < 0) {
8351 flags |= F_LJUST;
8352 width = -width;
8353 }
8354 if (--fmtcnt >= 0)
8355 c = *fmt++;
8356 }
8357 else if (c >= '0' && c <= '9') {
8358 width = c - '0';
8359 while (--fmtcnt >= 0) {
8360 c = *fmt++;
8361 if (c < '0' || c > '9')
8362 break;
8363 if ((width*10) / 10 != width) {
8364 PyErr_SetString(PyExc_ValueError,
8365 "width too big");
8366 goto onError;
8367 }
8368 width = width*10 + (c - '0');
8369 }
8370 }
8371 if (c == '.') {
8372 prec = 0;
8373 if (--fmtcnt >= 0)
8374 c = *fmt++;
8375 if (c == '*') {
8376 v = getnextarg(args, arglen, &argidx);
8377 if (v == NULL)
8378 goto onError;
8379 if (!PyInt_Check(v)) {
8380 PyErr_SetString(PyExc_TypeError,
8381 "* wants int");
8382 goto onError;
8383 }
8384 prec = PyInt_AsLong(v);
8385 if (prec < 0)
8386 prec = 0;
8387 if (--fmtcnt >= 0)
8388 c = *fmt++;
8389 }
8390 else if (c >= '0' && c <= '9') {
8391 prec = c - '0';
8392 while (--fmtcnt >= 0) {
8393 c = Py_CHARMASK(*fmt++);
8394 if (c < '0' || c > '9')
8395 break;
8396 if ((prec*10) / 10 != prec) {
8397 PyErr_SetString(PyExc_ValueError,
8398 "prec too big");
8399 goto onError;
8400 }
8401 prec = prec*10 + (c - '0');
8402 }
8403 }
8404 } /* prec */
8405 if (fmtcnt >= 0) {
8406 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008407 if (--fmtcnt >= 0)
8408 c = *fmt++;
8409 }
8410 }
8411 if (fmtcnt < 0) {
8412 PyErr_SetString(PyExc_ValueError,
8413 "incomplete format");
8414 goto onError;
8415 }
8416 if (c != '%') {
8417 v = getnextarg(args, arglen, &argidx);
8418 if (v == NULL)
8419 goto onError;
8420 }
8421 sign = 0;
8422 fill = ' ';
8423 switch (c) {
8424
8425 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008426 pbuf = formatbuf;
8427 /* presume that buffer length is at least 1 */
8428 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008429 len = 1;
8430 break;
8431
8432 case 's':
8433 case 'r':
8434 if (PyUnicode_Check(v) && c == 's') {
8435 temp = v;
8436 Py_INCREF(temp);
8437 }
8438 else {
8439 PyObject *unicode;
8440 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008441 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008442 else
8443 temp = PyObject_Repr(v);
8444 if (temp == NULL)
8445 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008446 if (PyUnicode_Check(temp))
8447 /* nothing to do */;
8448 else if (PyString_Check(temp)) {
8449 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008450 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008451 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008452 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008453 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008454 Py_DECREF(temp);
8455 temp = unicode;
8456 if (temp == NULL)
8457 goto onError;
8458 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008459 else {
8460 Py_DECREF(temp);
8461 PyErr_SetString(PyExc_TypeError,
8462 "%s argument has non-string str()");
8463 goto onError;
8464 }
8465 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008466 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008467 len = PyUnicode_GET_SIZE(temp);
8468 if (prec >= 0 && len > prec)
8469 len = prec;
8470 break;
8471
8472 case 'i':
8473 case 'd':
8474 case 'u':
8475 case 'o':
8476 case 'x':
8477 case 'X':
8478 if (c == 'i')
8479 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008480 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008481 temp = formatlong(v, flags, prec, c);
8482 if (!temp)
8483 goto onError;
8484 pbuf = PyUnicode_AS_UNICODE(temp);
8485 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008486 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008487 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008488 else {
8489 pbuf = formatbuf;
8490 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8491 flags, prec, c, v);
8492 if (len < 0)
8493 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008494 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008495 }
8496 if (flags & F_ZERO)
8497 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008498 break;
8499
8500 case 'e':
8501 case 'E':
8502 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008503 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008504 case 'g':
8505 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008506 if (c == 'F')
8507 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008508 pbuf = formatbuf;
8509 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8510 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008511 if (len < 0)
8512 goto onError;
8513 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008514 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008515 fill = '0';
8516 break;
8517
8518 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008519 pbuf = formatbuf;
8520 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008521 if (len < 0)
8522 goto onError;
8523 break;
8524
8525 default:
8526 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008527 "unsupported format character '%c' (0x%x) "
Armin Rigo7ccbca92006-10-04 12:17:45 +00008528 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008529 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008530 (int)c,
Armin Rigo7ccbca92006-10-04 12:17:45 +00008531 (Py_ssize_t)(fmt - 1 -
8532 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008533 goto onError;
8534 }
8535 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008536 if (*pbuf == '-' || *pbuf == '+') {
8537 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008538 len--;
8539 }
8540 else if (flags & F_SIGN)
8541 sign = '+';
8542 else if (flags & F_BLANK)
8543 sign = ' ';
8544 else
8545 sign = 0;
8546 }
8547 if (width < len)
8548 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008549 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008550 reslen -= rescnt;
8551 rescnt = width + fmtcnt + 100;
8552 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008553 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008554 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008555 PyErr_NoMemory();
8556 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008557 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008558 if (_PyUnicode_Resize(&result, reslen) < 0) {
8559 Py_XDECREF(temp);
8560 goto onError;
8561 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008562 res = PyUnicode_AS_UNICODE(result)
8563 + reslen - rescnt;
8564 }
8565 if (sign) {
8566 if (fill != ' ')
8567 *res++ = sign;
8568 rescnt--;
8569 if (width > len)
8570 width--;
8571 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008572 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8573 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008574 assert(pbuf[1] == c);
8575 if (fill != ' ') {
8576 *res++ = *pbuf++;
8577 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008578 }
Tim Petersfff53252001-04-12 18:38:48 +00008579 rescnt -= 2;
8580 width -= 2;
8581 if (width < 0)
8582 width = 0;
8583 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008584 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585 if (width > len && !(flags & F_LJUST)) {
8586 do {
8587 --rescnt;
8588 *res++ = fill;
8589 } while (--width > len);
8590 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008591 if (fill == ' ') {
8592 if (sign)
8593 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008594 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008595 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008596 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008597 *res++ = *pbuf++;
8598 *res++ = *pbuf++;
8599 }
8600 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008601 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602 res += len;
8603 rescnt -= len;
8604 while (--width >= len) {
8605 --rescnt;
8606 *res++ = ' ';
8607 }
8608 if (dict && (argidx < arglen) && c != '%') {
8609 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008610 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008611 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008612 goto onError;
8613 }
8614 Py_XDECREF(temp);
8615 } /* '%' */
8616 } /* until end */
8617 if (argidx < arglen && !dict) {
8618 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008619 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620 goto onError;
8621 }
8622
Thomas Woutersa96affe2006-03-12 00:29:36 +00008623 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8624 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625 if (args_owned) {
8626 Py_DECREF(args);
8627 }
8628 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629 return (PyObject *)result;
8630
8631 onError:
8632 Py_XDECREF(result);
8633 Py_DECREF(uformat);
8634 if (args_owned) {
8635 Py_DECREF(args);
8636 }
8637 return NULL;
8638}
8639
8640static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008641 (readbufferproc) unicode_buffer_getreadbuf,
8642 (writebufferproc) unicode_buffer_getwritebuf,
8643 (segcountproc) unicode_buffer_getsegcount,
8644 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008645};
8646
Jeremy Hylton938ace62002-07-17 16:30:39 +00008647static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008648unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8649
Tim Peters6d6c1a32001-08-02 04:15:00 +00008650static PyObject *
8651unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8652{
8653 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008654 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008655 char *encoding = NULL;
8656 char *errors = NULL;
8657
Guido van Rossume023fe02001-08-30 03:12:59 +00008658 if (type != &PyUnicode_Type)
8659 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008660 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8661 kwlist, &x, &encoding, &errors))
8662 return NULL;
8663 if (x == NULL)
8664 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008665 if (encoding == NULL && errors == NULL)
8666 return PyObject_Unicode(x);
8667 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008668 return PyUnicode_FromEncodedObject(x, encoding, errors);
8669}
8670
Guido van Rossume023fe02001-08-30 03:12:59 +00008671static PyObject *
8672unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8673{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008674 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008675 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008676
8677 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8678 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8679 if (tmp == NULL)
8680 return NULL;
8681 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008682 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008683 if (pnew == NULL) {
8684 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008685 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008686 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008687 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8688 if (pnew->str == NULL) {
8689 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008690 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008691 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008692 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008693 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008694 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8695 pnew->length = n;
8696 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008697 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008698 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008699}
8700
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008701PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008702"unicode(string [, encoding[, errors]]) -> object\n\
8703\n\
8704Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008705encoding defaults to the current default string encoding.\n\
8706errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008707
Guido van Rossumd57fd912000-03-10 22:53:23 +00008708PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008709 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008710 "unicode", /* tp_name */
8711 sizeof(PyUnicodeObject), /* tp_size */
8712 0, /* tp_itemsize */
8713 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008714 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008715 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008716 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008717 0, /* tp_setattr */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008718 0, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00008719 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008720 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008721 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008722 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008723 (hashfunc) unicode_hash, /* tp_hash*/
8724 0, /* tp_call*/
8725 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008726 PyObject_GenericGetAttr, /* tp_getattro */
8727 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008728 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008729 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Neal Norwitzee3a1b52007-02-25 19:44:48 +00008730 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008731 unicode_doc, /* tp_doc */
8732 0, /* tp_traverse */
8733 0, /* tp_clear */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008734 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008735 0, /* tp_weaklistoffset */
8736 0, /* tp_iter */
8737 0, /* tp_iternext */
8738 unicode_methods, /* tp_methods */
8739 0, /* tp_members */
8740 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008741 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008742 0, /* tp_dict */
8743 0, /* tp_descr_get */
8744 0, /* tp_descr_set */
8745 0, /* tp_dictoffset */
8746 0, /* tp_init */
8747 0, /* tp_alloc */
8748 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008749 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008750};
8751
8752/* Initialize the Unicode implementation */
8753
Thomas Wouters78890102000-07-22 19:25:51 +00008754void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008755{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008756 int i;
8757
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008758 /* XXX - move this array to unicodectype.c ? */
8759 Py_UNICODE linebreak[] = {
8760 0x000A, /* LINE FEED */
8761 0x000D, /* CARRIAGE RETURN */
8762 0x001C, /* FILE SEPARATOR */
8763 0x001D, /* GROUP SEPARATOR */
8764 0x001E, /* RECORD SEPARATOR */
8765 0x0085, /* NEXT LINE */
8766 0x2028, /* LINE SEPARATOR */
8767 0x2029, /* PARAGRAPH SEPARATOR */
8768 };
8769
Fred Drakee4315f52000-05-09 19:53:39 +00008770 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008771 unicode_freelist = NULL;
8772 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008773 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008774 if (!unicode_empty)
8775 return;
8776
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008777 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008778 for (i = 0; i < 256; i++)
8779 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008780 if (PyType_Ready(&PyUnicode_Type) < 0)
8781 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008782
8783 /* initialize the linebreak bloom filter */
8784 bloom_linebreak = make_bloom_mask(
8785 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8786 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008787
8788 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008789}
8790
8791/* Finalize the Unicode implementation */
8792
8793void
Thomas Wouters78890102000-07-22 19:25:51 +00008794_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008795{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008796 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008797 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008798
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008799 Py_XDECREF(unicode_empty);
8800 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008801
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008802 for (i = 0; i < 256; i++) {
8803 if (unicode_latin1[i]) {
8804 Py_DECREF(unicode_latin1[i]);
8805 unicode_latin1[i] = NULL;
8806 }
8807 }
8808
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008809 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008810 PyUnicodeObject *v = u;
8811 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008812 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008813 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008814 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008815 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008816 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008817 unicode_freelist = NULL;
8818 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008819}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008820
Anthony Baxterac6bd462006-04-13 02:06:09 +00008821#ifdef __cplusplus
8822}
8823#endif
8824
8825
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008826/*
8827Local variables:
8828c-basic-offset: 4
8829indent-tabs-mode: nil
8830End:
8831*/