blob: 793728115222102fba15478109610834fd56abdc [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of Unicode objects kept on the free list */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Threshold for the free list "keep alive" optimization.

   An object placed on the free list keeps its character buffer
   allocated as long as its size is below this limit. This saves
   malloc() overhead when small Unicode objects are created.

   In the worst case this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
/* Byte order selection: little endian unless WORDS_BIGENDIAN is defined */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000116PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000117{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000118#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000119 return 0x10FFFF;
120#else
121 /* This is actually an illegal character, so it should
122 not be passed to unichr. */
123 return 0xFFFF;
124#endif
125}
126
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000127/* --- Bloom Filters ----------------------------------------------------- */
128
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, a single bitmask is used, with the 5 least
   significant bits of each Unicode character serving as the bit index. */
132
133/* the linebreak mask is set up by Unicode_Init below */
134
/* Integer type holding a bloom bitmask. */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low 5 bits of `ch` is set in
   `mask`. The shift is done on an unsigned long so that bit 31 can be
   reached without overflowing a signed int, and `mask` is parenthesized
   so that compound expressions may be passed. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line break test: cheap bloom pre-check first, exact check second. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
143
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000144Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000145{
146 /* calculate simple bloom-style bitmask for a given unicode string */
147
148 long mask;
149 Py_ssize_t i;
150
151 mask = 0;
152 for (i = 0; i < len; i++)
153 mask |= (1 << (ptr[i] & 0x1F));
154
155 return mask;
156}
157
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000158Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000159{
160 Py_ssize_t i;
161
162 for (i = 0; i < setlen; i++)
163 if (set[i] == chr)
164 return 1;
165
Fredrik Lundh77633512006-05-23 19:47:35 +0000166 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000167}
168
/* Membership test with a bloom pre-check: unicode_member() is only
   called when the bloom mask says chr may be in the set. The expansion
   is fully parenthesized so it can be negated or combined safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
171
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172/* --- Unicode Object ----------------------------------------------------- */
173
174static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000176 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177{
178 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000179
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000180 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000181 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000182 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184 /* Resizing shared object (unicode_empty or single character
185 objects) in-place is not allowed. Use PyUnicode_Resize()
186 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000187
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000188 if (unicode == unicode_empty ||
189 (unicode->length == 1 &&
190 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000191 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000192 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 return -1;
195 }
196
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000197 /* We allocate one more byte to make sure the string is Ux0000 terminated.
198 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000199 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000200 it contains). */
201
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 oldstr = unicode->str;
203 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
204 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000205 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000206 PyErr_NoMemory();
207 return -1;
208 }
209 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000210 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000212 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000214 if (unicode->defenc) {
215 Py_DECREF(unicode->defenc);
216 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217 }
218 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000219
Guido van Rossumd57fd912000-03-10 22:53:23 +0000220 return 0;
221}
222
223/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000224 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225
226 XXX This allocator could further be enhanced by assuring that the
227 free list never reduces its size below 1.
228
229*/
230
231static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000232PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233{
234 register PyUnicodeObject *unicode;
235
Andrew Dalkee0df7622006-05-27 11:04:36 +0000236 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237 if (length == 0 && unicode_empty != NULL) {
238 Py_INCREF(unicode_empty);
239 return unicode_empty;
240 }
241
242 /* Unicode freelist & memory allocation */
243 if (unicode_freelist) {
244 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000245 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000248 /* Keep-Alive optimization: we only upsize the buffer,
249 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000250 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000251 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000252 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000253 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254 }
255 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000256 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000258 }
259 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000262 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 if (unicode == NULL)
264 return NULL;
265 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
266 }
267
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000268 if (!unicode->str) {
269 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000270 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000271 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000272 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000273 * the caller fails before initializing str -- unicode_resize()
274 * reads str[0], and the Keep-Alive optimization can keep memory
275 * allocated for str alive across a call to unicode_dealloc(unicode).
276 * We don't want unicode_resize to read uninitialized memory in
277 * that case.
278 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000279 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000283 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285
286 onError:
287 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000288 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000289 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290}
291
292static
Guido van Rossum9475a232001-10-05 20:51:39 +0000293void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000295 if (PyUnicode_CheckExact(unicode) &&
296 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000297 /* Keep-Alive optimization */
298 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000299 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 unicode->str = NULL;
301 unicode->length = 0;
302 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000303 if (unicode->defenc) {
304 Py_DECREF(unicode->defenc);
305 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000306 }
307 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 *(PyUnicodeObject **)unicode = unicode_freelist;
309 unicode_freelist = unicode;
310 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311 }
312 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000313 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000314 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000315 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316 }
317}
318
Martin v. Löwis18e16552006-02-15 17:27:45 +0000319int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000320{
321 register PyUnicodeObject *v;
322
323 /* Argument checks */
324 if (unicode == NULL) {
325 PyErr_BadInternalCall();
326 return -1;
327 }
328 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000329 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000330 PyErr_BadInternalCall();
331 return -1;
332 }
333
334 /* Resizing unicode_empty and single character objects is not
335 possible since these are being shared. We simply return a fresh
336 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000337 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000338 (v == unicode_empty || v->length == 1)) {
339 PyUnicodeObject *w = _PyUnicode_New(length);
340 if (w == NULL)
341 return -1;
342 Py_UNICODE_COPY(w->str, v->str,
343 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000344 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000345 *unicode = (PyObject *)w;
346 return 0;
347 }
348
349 /* Note that we don't have to modify *unicode for unshared Unicode
350 objects, since we can modify them in-place. */
351 return unicode_resize(v, length);
352}
353
/* Internal convenience wrapper (unicodeobject.c only !): casts the
   address of a Unicode object variable to PyObject ** and forwards to
   PyUnicode_Resize(). */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
357
Guido van Rossumd57fd912000-03-10 22:53:23 +0000358PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000359 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360{
361 PyUnicodeObject *unicode;
362
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000363 /* If the Unicode data is known at construction time, we can apply
364 some optimizations which share commonly used objects. */
365 if (u != NULL) {
366
367 /* Optimization for empty strings */
368 if (size == 0 && unicode_empty != NULL) {
369 Py_INCREF(unicode_empty);
370 return (PyObject *)unicode_empty;
371 }
372
373 /* Single character Unicode objects in the Latin-1 range are
374 shared when using this constructor */
375 if (size == 1 && *u < 256) {
376 unicode = unicode_latin1[*u];
377 if (!unicode) {
378 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000379 if (!unicode)
380 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000381 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000382 unicode_latin1[*u] = unicode;
383 }
384 Py_INCREF(unicode);
385 return (PyObject *)unicode;
386 }
387 }
Tim Petersced69f82003-09-16 20:30:58 +0000388
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 unicode = _PyUnicode_New(size);
390 if (!unicode)
391 return NULL;
392
393 /* Copy the Unicode data into the new object */
394 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000395 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000396
397 return (PyObject *)unicode;
398}
399
400#ifdef HAVE_WCHAR_H
401
402PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000403 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000404{
405 PyUnicodeObject *unicode;
406
407 if (w == NULL) {
408 PyErr_BadInternalCall();
409 return NULL;
410 }
411
412 unicode = _PyUnicode_New(size);
413 if (!unicode)
414 return NULL;
415
416 /* Copy the wchar_t data into the new object */
417#ifdef HAVE_USABLE_WCHAR_T
418 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000419#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000420 {
421 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000422 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000424 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425 *u++ = *w++;
426 }
427#endif
428
429 return (PyObject *)unicode;
430}
431
Martin v. Löwis18e16552006-02-15 17:27:45 +0000432Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
433 wchar_t *w,
434 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000435{
436 if (unicode == NULL) {
437 PyErr_BadInternalCall();
438 return -1;
439 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000440
441 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000442 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000443 size = PyUnicode_GET_SIZE(unicode) + 1;
444
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445#ifdef HAVE_USABLE_WCHAR_T
446 memcpy(w, unicode->str, size * sizeof(wchar_t));
447#else
448 {
449 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000450 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000451 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000452 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000453 *w++ = *u++;
454 }
455#endif
456
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000457 if (size > PyUnicode_GET_SIZE(unicode))
458 return PyUnicode_GET_SIZE(unicode);
459 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000460 return size;
461}
462
463#endif
464
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000465PyObject *PyUnicode_FromOrdinal(int ordinal)
466{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000467 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000468
469#ifdef Py_UNICODE_WIDE
470 if (ordinal < 0 || ordinal > 0x10ffff) {
471 PyErr_SetString(PyExc_ValueError,
472 "unichr() arg not in range(0x110000) "
473 "(wide Python build)");
474 return NULL;
475 }
476#else
477 if (ordinal < 0 || ordinal > 0xffff) {
478 PyErr_SetString(PyExc_ValueError,
479 "unichr() arg not in range(0x10000) "
480 "(narrow Python build)");
481 return NULL;
482 }
483#endif
484
Hye-Shik Chang40574832004-04-06 07:24:51 +0000485 s[0] = (Py_UNICODE)ordinal;
486 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000487}
488
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489PyObject *PyUnicode_FromObject(register PyObject *obj)
490{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491 /* XXX Perhaps we should make this API an alias of
492 PyObject_Unicode() instead ?! */
493 if (PyUnicode_CheckExact(obj)) {
494 Py_INCREF(obj);
495 return obj;
496 }
497 if (PyUnicode_Check(obj)) {
498 /* For a Unicode subtype that's not a Unicode object,
499 return a true Unicode object with the same data. */
500 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
501 PyUnicode_GET_SIZE(obj));
502 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000503 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
504}
505
506PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
507 const char *encoding,
508 const char *errors)
509{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000510 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000511 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000512 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000513
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 if (obj == NULL) {
515 PyErr_BadInternalCall();
516 return NULL;
517 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000518
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000519#if 0
520 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000521 that no encodings is given and then redirect to
522 PyObject_Unicode() which then applies the additional logic for
523 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000524
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000525 NOTE: This API should really only be used for object which
526 represent *encoded* Unicode !
527
528 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000529 if (PyUnicode_Check(obj)) {
530 if (encoding) {
531 PyErr_SetString(PyExc_TypeError,
532 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000533 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000534 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000535 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000536 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000537#else
538 if (PyUnicode_Check(obj)) {
539 PyErr_SetString(PyExc_TypeError,
540 "decoding Unicode is not supported");
541 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000542 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000543#endif
544
545 /* Coerce object */
546 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000547 s = PyString_AS_STRING(obj);
548 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000549 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000550 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
551 /* Overwrite the error message with something more useful in
552 case of a TypeError. */
553 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000554 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000555 "coercing to Unicode: need string or buffer, "
556 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000557 obj->ob_type->tp_name);
558 goto onError;
559 }
Tim Petersced69f82003-09-16 20:30:58 +0000560
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000561 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000562 if (len == 0) {
563 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000564 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000565 }
Tim Petersced69f82003-09-16 20:30:58 +0000566 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000567 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000568
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000569 return v;
570
571 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000572 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573}
574
575PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000576 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000577 const char *encoding,
578 const char *errors)
579{
580 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000581
582 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000583 encoding = PyUnicode_GetDefaultEncoding();
584
585 /* Shortcuts for common default encodings */
586 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000587 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000588 else if (strcmp(encoding, "latin-1") == 0)
589 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000590#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
591 else if (strcmp(encoding, "mbcs") == 0)
592 return PyUnicode_DecodeMBCS(s, size, errors);
593#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000594 else if (strcmp(encoding, "ascii") == 0)
595 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596
597 /* Decode via the codec registry */
598 buffer = PyBuffer_FromMemory((void *)s, size);
599 if (buffer == NULL)
600 goto onError;
601 unicode = PyCodec_Decode(buffer, encoding, errors);
602 if (unicode == NULL)
603 goto onError;
604 if (!PyUnicode_Check(unicode)) {
605 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000606 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000607 unicode->ob_type->tp_name);
608 Py_DECREF(unicode);
609 goto onError;
610 }
611 Py_DECREF(buffer);
612 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000613
Guido van Rossumd57fd912000-03-10 22:53:23 +0000614 onError:
615 Py_XDECREF(buffer);
616 return NULL;
617}
618
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000619PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
620 const char *encoding,
621 const char *errors)
622{
623 PyObject *v;
624
625 if (!PyUnicode_Check(unicode)) {
626 PyErr_BadArgument();
627 goto onError;
628 }
629
630 if (encoding == NULL)
631 encoding = PyUnicode_GetDefaultEncoding();
632
633 /* Decode via the codec registry */
634 v = PyCodec_Decode(unicode, encoding, errors);
635 if (v == NULL)
636 goto onError;
637 return v;
638
639 onError:
640 return NULL;
641}
642
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000644 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 const char *encoding,
646 const char *errors)
647{
648 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000649
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 unicode = PyUnicode_FromUnicode(s, size);
651 if (unicode == NULL)
652 return NULL;
653 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
654 Py_DECREF(unicode);
655 return v;
656}
657
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000658PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
659 const char *encoding,
660 const char *errors)
661{
662 PyObject *v;
663
664 if (!PyUnicode_Check(unicode)) {
665 PyErr_BadArgument();
666 goto onError;
667 }
668
669 if (encoding == NULL)
670 encoding = PyUnicode_GetDefaultEncoding();
671
672 /* Encode via the codec registry */
673 v = PyCodec_Encode(unicode, encoding, errors);
674 if (v == NULL)
675 goto onError;
676 return v;
677
678 onError:
679 return NULL;
680}
681
Guido van Rossumd57fd912000-03-10 22:53:23 +0000682PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
683 const char *encoding,
684 const char *errors)
685{
686 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000687
Guido van Rossumd57fd912000-03-10 22:53:23 +0000688 if (!PyUnicode_Check(unicode)) {
689 PyErr_BadArgument();
690 goto onError;
691 }
Fred Drakee4315f52000-05-09 19:53:39 +0000692
Tim Petersced69f82003-09-16 20:30:58 +0000693 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000694 encoding = PyUnicode_GetDefaultEncoding();
695
696 /* Shortcuts for common default encodings */
697 if (errors == NULL) {
698 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000699 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000700 else if (strcmp(encoding, "latin-1") == 0)
701 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000702#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
703 else if (strcmp(encoding, "mbcs") == 0)
704 return PyUnicode_AsMBCSString(unicode);
705#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000706 else if (strcmp(encoding, "ascii") == 0)
707 return PyUnicode_AsASCIIString(unicode);
708 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000709
710 /* Encode via the codec registry */
711 v = PyCodec_Encode(unicode, encoding, errors);
712 if (v == NULL)
713 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000714 if (!PyString_Check(v)) {
715 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000716 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717 v->ob_type->tp_name);
718 Py_DECREF(v);
719 goto onError;
720 }
721 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000722
Guido van Rossumd57fd912000-03-10 22:53:23 +0000723 onError:
724 return NULL;
725}
726
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000727PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
728 const char *errors)
729{
730 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
731
732 if (v)
733 return v;
734 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
735 if (v && errors == NULL)
736 ((PyUnicodeObject *)unicode)->defenc = v;
737 return v;
738}
739
Guido van Rossumd57fd912000-03-10 22:53:23 +0000740Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
741{
742 if (!PyUnicode_Check(unicode)) {
743 PyErr_BadArgument();
744 goto onError;
745 }
746 return PyUnicode_AS_UNICODE(unicode);
747
748 onError:
749 return NULL;
750}
751
Martin v. Löwis18e16552006-02-15 17:27:45 +0000752Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000753{
754 if (!PyUnicode_Check(unicode)) {
755 PyErr_BadArgument();
756 goto onError;
757 }
758 return PyUnicode_GET_SIZE(unicode);
759
760 onError:
761 return -1;
762}
763
Thomas Wouters78890102000-07-22 19:25:51 +0000764const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000765{
766 return unicode_default_encoding;
767}
768
769int PyUnicode_SetDefaultEncoding(const char *encoding)
770{
771 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000772
Fred Drakee4315f52000-05-09 19:53:39 +0000773 /* Make sure the encoding is valid. As side effect, this also
774 loads the encoding into the codec registry cache. */
775 v = _PyCodec_Lookup(encoding);
776 if (v == NULL)
777 goto onError;
778 Py_DECREF(v);
779 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000780 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000781 sizeof(unicode_default_encoding));
782 return 0;
783
784 onError:
785 return -1;
786}
787
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000788/* error handling callback helper:
789 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000790 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000791 and adjust various state variables.
792 return 0 on success, -1 on error
793*/
794
795static
796int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
797 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000798 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
799 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000800{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000801 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000802
803 PyObject *restuple = NULL;
804 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000805 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
806 Py_ssize_t requiredsize;
807 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000808 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000809 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000810 int res = -1;
811
812 if (*errorHandler == NULL) {
813 *errorHandler = PyCodec_LookupError(errors);
814 if (*errorHandler == NULL)
815 goto onError;
816 }
817
818 if (*exceptionObject == NULL) {
819 *exceptionObject = PyUnicodeDecodeError_Create(
820 encoding, input, insize, *startinpos, *endinpos, reason);
821 if (*exceptionObject == NULL)
822 goto onError;
823 }
824 else {
825 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
826 goto onError;
827 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
828 goto onError;
829 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
830 goto onError;
831 }
832
833 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
834 if (restuple == NULL)
835 goto onError;
836 if (!PyTuple_Check(restuple)) {
837 PyErr_Format(PyExc_TypeError, &argparse[4]);
838 goto onError;
839 }
840 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
841 goto onError;
842 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000843 newpos = insize+newpos;
844 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000845 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000846 goto onError;
847 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000848
849 /* need more space? (at least enough for what we
850 have+the replacement+the rest of the string (starting
851 at the new input position), so we won't have to check space
852 when there are no errors in the rest of the string) */
853 repptr = PyUnicode_AS_UNICODE(repunicode);
854 repsize = PyUnicode_GET_SIZE(repunicode);
855 requiredsize = *outpos + repsize + insize-newpos;
856 if (requiredsize > outsize) {
857 if (requiredsize<2*outsize)
858 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000859 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000860 goto onError;
861 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
862 }
863 *endinpos = newpos;
864 *inptr = input + newpos;
865 Py_UNICODE_COPY(*outptr, repptr, repsize);
866 *outptr += repsize;
867 *outpos += repsize;
868 /* we made it! */
869 res = 0;
870
871 onError:
872 Py_XDECREF(restuple);
873 return res;
874}
875
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000876/* --- UTF-7 Codec -------------------------------------------------------- */
877
878/* see RFC2152 for details */
879
/* Per-character classification for the UTF-7 codec, indexed by character
   code 0..127.  A value says whether the character is special, i.e.
   cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static char utf7_special[128] = {
    /* 0x00 - 0x0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
898
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if character c may not be written directly in UTF-7 output.
   encodeO / encodeWS additionally treat the utf7_special classes 3
   (RFC2152 Set O) and 2 (whitespace) as special. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Base64 alphabet character for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c belongs to the base64 alphabet.  isalnum() is only defined for
   values representable as unsigned char (or EOF) and is locale dependent
   above 127, so anything outside 0..127 is rejected before it is called.
   The mask test is used instead of a range comparison so that it also
   works, without compiler warnings, for unsigned character types. */
#define B64CHAR(c) \
    ((((c) & ~0x7F) == 0 && isalnum(c)) || (c) == '+' || (c) == '/')

/* Six-bit value of base64 alphabet character c; only meaningful when
   B64CHAR(c) is true. */
#define UB64(c)        \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 characters to out for as long as at least six bits are
   buffered in ch; bits is the number of buffered bits and is reduced
   accordingly.  out and bits are modified in place. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* Emit 16-bit code units to out for as long as at least sixteen bits are
   buffered in ch.  Must be expanded inside a function that declares a
   variable named errmsg and a label named utf7Error: a code unit in the
   range 0xDC00..0xDFFF sets surrogate, stores a message in errmsg and
   jumps to utf7Error.  While surrogate is set, the next code unit is
   dropped and the flag is cleared. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000941
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000942PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000943 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000944 const char *errors)
945{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000946 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000947 Py_ssize_t startinpos;
948 Py_ssize_t endinpos;
949 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000950 const char *e;
951 PyUnicodeObject *unicode;
952 Py_UNICODE *p;
953 const char *errmsg = "";
954 int inShift = 0;
955 unsigned int bitsleft = 0;
956 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000957 int surrogate = 0;
958 PyObject *errorHandler = NULL;
959 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000960
961 unicode = _PyUnicode_New(size);
962 if (!unicode)
963 return NULL;
964 if (size == 0)
965 return (PyObject *)unicode;
966
967 p = unicode->str;
968 e = s + size;
969
970 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000971 Py_UNICODE ch;
972 restart:
973 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000974
975 if (inShift) {
976 if ((ch == '-') || !B64CHAR(ch)) {
977 inShift = 0;
978 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000979
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000980 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
981 if (bitsleft >= 6) {
982 /* The shift sequence has a partial character in it. If
983 bitsleft < 6 then we could just classify it as padding
984 but that is not the case here */
985
986 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000987 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000988 }
989 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000990 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000991 here so indicate the potential of a misencoded character. */
992
993 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
994 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
995 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000996 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000997 }
998
999 if (ch == '-') {
1000 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001001 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001002 inShift = 1;
1003 }
1004 } else if (SPECIAL(ch,0,0)) {
1005 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001006 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001007 } else {
1008 *p++ = ch;
1009 }
1010 } else {
1011 charsleft = (charsleft << 6) | UB64(ch);
1012 bitsleft += 6;
1013 s++;
1014 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1015 }
1016 }
1017 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001018 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001019 s++;
1020 if (s < e && *s == '-') {
1021 s++;
1022 *p++ = '+';
1023 } else
1024 {
1025 inShift = 1;
1026 bitsleft = 0;
1027 }
1028 }
1029 else if (SPECIAL(ch,0,0)) {
1030 errmsg = "unexpected special character";
1031 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001032 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001033 }
1034 else {
1035 *p++ = ch;
1036 s++;
1037 }
1038 continue;
1039 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001040 outpos = p-PyUnicode_AS_UNICODE(unicode);
1041 endinpos = s-starts;
1042 if (unicode_decode_call_errorhandler(
1043 errors, &errorHandler,
1044 "utf7", errmsg,
1045 starts, size, &startinpos, &endinpos, &exc, &s,
1046 (PyObject **)&unicode, &outpos, &p))
1047 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001048 }
1049
1050 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001051 outpos = p-PyUnicode_AS_UNICODE(unicode);
1052 endinpos = size;
1053 if (unicode_decode_call_errorhandler(
1054 errors, &errorHandler,
1055 "utf7", "unterminated shift sequence",
1056 starts, size, &startinpos, &endinpos, &exc, &s,
1057 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001058 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001059 if (s < e)
1060 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001061 }
1062
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001063 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001064 goto onError;
1065
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001066 Py_XDECREF(errorHandler);
1067 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001068 return (PyObject *)unicode;
1069
1070onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001071 Py_XDECREF(errorHandler);
1072 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001073 Py_DECREF(unicode);
1074 return NULL;
1075}
1076
1077
1078PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001079 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001080 int encodeSetO,
1081 int encodeWhiteSpace,
1082 const char *errors)
1083{
1084 PyObject *v;
1085 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001086 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001087 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001088 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001089 unsigned int bitsleft = 0;
1090 unsigned long charsleft = 0;
1091 char * out;
1092 char * start;
1093
1094 if (size == 0)
1095 return PyString_FromStringAndSize(NULL, 0);
1096
1097 v = PyString_FromStringAndSize(NULL, cbAllocated);
1098 if (v == NULL)
1099 return NULL;
1100
1101 start = out = PyString_AS_STRING(v);
1102 for (;i < size; ++i) {
1103 Py_UNICODE ch = s[i];
1104
1105 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001106 if (ch == '+') {
1107 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001108 *out++ = '-';
1109 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1110 charsleft = ch;
1111 bitsleft = 16;
1112 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001113 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001114 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001115 } else {
1116 *out++ = (char) ch;
1117 }
1118 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001119 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1120 *out++ = B64(charsleft << (6-bitsleft));
1121 charsleft = 0;
1122 bitsleft = 0;
1123 /* Characters not in the BASE64 set implicitly unshift the sequence
1124 so no '-' is required, except if the character is itself a '-' */
1125 if (B64CHAR(ch) || ch == '-') {
1126 *out++ = '-';
1127 }
1128 inShift = 0;
1129 *out++ = (char) ch;
1130 } else {
1131 bitsleft += 16;
1132 charsleft = (charsleft << 16) | ch;
1133 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1134
1135 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001136 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001137 or '-' then the shift sequence will be terminated implicitly and we
1138 don't have to insert a '-'. */
1139
1140 if (bitsleft == 0) {
1141 if (i + 1 < size) {
1142 Py_UNICODE ch2 = s[i+1];
1143
1144 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001145
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001146 } else if (B64CHAR(ch2) || ch2 == '-') {
1147 *out++ = '-';
1148 inShift = 0;
1149 } else {
1150 inShift = 0;
1151 }
1152
1153 }
1154 else {
1155 *out++ = '-';
1156 inShift = 0;
1157 }
1158 }
Tim Petersced69f82003-09-16 20:30:58 +00001159 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001160 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001161 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001162 if (bitsleft) {
1163 *out++= B64(charsleft << (6-bitsleft) );
1164 *out++ = '-';
1165 }
1166
Tim Peters5de98422002-04-27 18:44:32 +00001167 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001168 return v;
1169}
1170
1171#undef SPECIAL
1172#undef B64
1173#undef B64CHAR
1174#undef UB64
1175#undef ENCODE
1176#undef DECODE
1177
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178/* --- UTF-8 Codec -------------------------------------------------------- */
1179
/* Sequence length implied by each possible first byte of a UTF-8
   sequence, indexed by the byte value.  Zero means the byte is an illegal
   prefix.  See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 - 0x1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 - 0x3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 - 0x4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 - 0x6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0x8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 - 0xAF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 - 0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xCF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0 - 0xDF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1201
Guido van Rossumd57fd912000-03-10 22:53:23 +00001202PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001203 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001204 const char *errors)
1205{
Walter Dörwald69652032004-09-07 20:24:22 +00001206 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1207}
1208
1209PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001210 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001211 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001212 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001213{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001214 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001215 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001216 Py_ssize_t startinpos;
1217 Py_ssize_t endinpos;
1218 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219 const char *e;
1220 PyUnicodeObject *unicode;
1221 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001222 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001223 PyObject *errorHandler = NULL;
1224 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225
1226 /* Note: size will always be longer than the resulting Unicode
1227 character count */
1228 unicode = _PyUnicode_New(size);
1229 if (!unicode)
1230 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001231 if (size == 0) {
1232 if (consumed)
1233 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001234 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236
1237 /* Unpack UTF-8 encoded data */
1238 p = unicode->str;
1239 e = s + size;
1240
1241 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001242 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243
1244 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001245 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001246 s++;
1247 continue;
1248 }
1249
1250 n = utf8_code_length[ch];
1251
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001252 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001253 if (consumed)
1254 break;
1255 else {
1256 errmsg = "unexpected end of data";
1257 startinpos = s-starts;
1258 endinpos = size;
1259 goto utf8Error;
1260 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001261 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001262
1263 switch (n) {
1264
1265 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001266 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001267 startinpos = s-starts;
1268 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001269 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001270
1271 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001272 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001273 startinpos = s-starts;
1274 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001275 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276
1277 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001278 if ((s[1] & 0xc0) != 0x80) {
1279 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001280 startinpos = s-starts;
1281 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001282 goto utf8Error;
1283 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001284 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001285 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001286 startinpos = s-starts;
1287 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001288 errmsg = "illegal encoding";
1289 goto utf8Error;
1290 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001292 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 break;
1294
1295 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001296 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001297 (s[2] & 0xc0) != 0x80) {
1298 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001299 startinpos = s-starts;
1300 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001301 goto utf8Error;
1302 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001303 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001304 if (ch < 0x0800) {
1305 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001306 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001307
1308 XXX For wide builds (UCS-4) we should probably try
1309 to recombine the surrogates into a single code
1310 unit.
1311 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001312 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001313 startinpos = s-starts;
1314 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001315 goto utf8Error;
1316 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001317 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001318 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001319 break;
1320
1321 case 4:
1322 if ((s[1] & 0xc0) != 0x80 ||
1323 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001324 (s[3] & 0xc0) != 0x80) {
1325 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001326 startinpos = s-starts;
1327 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001328 goto utf8Error;
1329 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001330 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1331 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1332 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001333 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001334 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001335 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001336 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001337 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001338 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001339 startinpos = s-starts;
1340 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001341 goto utf8Error;
1342 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001343#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001344 *p++ = (Py_UNICODE)ch;
1345#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001346 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001347
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001348 /* translate from 10000..10FFFF to 0..FFFF */
1349 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001350
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001351 /* high surrogate = top 10 bits added to D800 */
1352 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001353
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001354 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001355 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001356#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001357 break;
1358
1359 default:
1360 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001361 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001362 startinpos = s-starts;
1363 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001364 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365 }
1366 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001367 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001368
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001369 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001370 outpos = p-PyUnicode_AS_UNICODE(unicode);
1371 if (unicode_decode_call_errorhandler(
1372 errors, &errorHandler,
1373 "utf8", errmsg,
1374 starts, size, &startinpos, &endinpos, &exc, &s,
1375 (PyObject **)&unicode, &outpos, &p))
1376 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001377 }
Walter Dörwald69652032004-09-07 20:24:22 +00001378 if (consumed)
1379 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001380
1381 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001382 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001383 goto onError;
1384
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001385 Py_XDECREF(errorHandler);
1386 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 return (PyObject *)unicode;
1388
1389onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001390 Py_XDECREF(errorHandler);
1391 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 Py_DECREF(unicode);
1393 return NULL;
1394}
1395
Tim Peters602f7402002-04-27 18:03:26 +00001396/* Allocation strategy: if the string is short, convert into a stack buffer
1397 and allocate exactly as much space needed at the end. Else allocate the
1398 maximum possible needed (4 result bytes per Unicode character), and return
1399 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001400*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001401PyObject *
1402PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001403 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001404 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001405{
Tim Peters602f7402002-04-27 18:03:26 +00001406#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001407
Martin v. Löwis18e16552006-02-15 17:27:45 +00001408 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001409 PyObject *v; /* result string object */
1410 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001411 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001412 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001413 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001414
Tim Peters602f7402002-04-27 18:03:26 +00001415 assert(s != NULL);
1416 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001417
Tim Peters602f7402002-04-27 18:03:26 +00001418 if (size <= MAX_SHORT_UNICHARS) {
1419 /* Write into the stack buffer; nallocated can't overflow.
1420 * At the end, we'll allocate exactly as much heap space as it
1421 * turns out we need.
1422 */
1423 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1424 v = NULL; /* will allocate after we're done */
1425 p = stackbuf;
1426 }
1427 else {
1428 /* Overallocate on the heap, and give the excess back at the end. */
1429 nallocated = size * 4;
1430 if (nallocated / 4 != size) /* overflow! */
1431 return PyErr_NoMemory();
1432 v = PyString_FromStringAndSize(NULL, nallocated);
1433 if (v == NULL)
1434 return NULL;
1435 p = PyString_AS_STRING(v);
1436 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001437
Tim Peters602f7402002-04-27 18:03:26 +00001438 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001439 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001440
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001441 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001442 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001443 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001444
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001446 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001447 *p++ = (char)(0xc0 | (ch >> 6));
1448 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001449 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001450 else {
Tim Peters602f7402002-04-27 18:03:26 +00001451 /* Encode UCS2 Unicode ordinals */
1452 if (ch < 0x10000) {
1453 /* Special case: check for high surrogate */
1454 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1455 Py_UCS4 ch2 = s[i];
1456 /* Check for low surrogate and combine the two to
1457 form a UCS4 value */
1458 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001459 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001460 i++;
1461 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001462 }
Tim Peters602f7402002-04-27 18:03:26 +00001463 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001464 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001465 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001466 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1467 *p++ = (char)(0x80 | (ch & 0x3f));
1468 continue;
1469 }
1470encodeUCS4:
1471 /* Encode UCS4 Unicode ordinals */
1472 *p++ = (char)(0xf0 | (ch >> 18));
1473 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1474 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1475 *p++ = (char)(0x80 | (ch & 0x3f));
1476 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001477 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001478
Tim Peters602f7402002-04-27 18:03:26 +00001479 if (v == NULL) {
1480 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001481 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001482 assert(nneeded <= nallocated);
1483 v = PyString_FromStringAndSize(stackbuf, nneeded);
1484 }
1485 else {
1486 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001487 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001488 assert(nneeded <= nallocated);
1489 _PyString_Resize(&v, nneeded);
1490 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001491 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001492
Tim Peters602f7402002-04-27 18:03:26 +00001493#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001494}
1495
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1497{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001498 if (!PyUnicode_Check(unicode)) {
1499 PyErr_BadArgument();
1500 return NULL;
1501 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001502 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1503 PyUnicode_GET_SIZE(unicode),
1504 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001505}
1506
1507/* --- UTF-16 Codec ------------------------------------------------------- */
1508
Tim Peters772747b2001-08-09 22:21:55 +00001509PyObject *
1510PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001511 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001512 const char *errors,
1513 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001514{
Walter Dörwald69652032004-09-07 20:24:22 +00001515 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1516}
1517
1518PyObject *
1519PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001520 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001521 const char *errors,
1522 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001523 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001524{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001525 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001526 Py_ssize_t startinpos;
1527 Py_ssize_t endinpos;
1528 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001529 PyUnicodeObject *unicode;
1530 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001531 const unsigned char *q, *e;
1532 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001533 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001534 /* Offsets from q for retrieving byte pairs in the right order. */
1535#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1536 int ihi = 1, ilo = 0;
1537#else
1538 int ihi = 0, ilo = 1;
1539#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001540 PyObject *errorHandler = NULL;
1541 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542
1543 /* Note: size will always be longer than the resulting Unicode
1544 character count */
1545 unicode = _PyUnicode_New(size);
1546 if (!unicode)
1547 return NULL;
1548 if (size == 0)
1549 return (PyObject *)unicode;
1550
1551 /* Unpack UTF-16 encoded data */
1552 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001553 q = (unsigned char *)s;
1554 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555
1556 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001557 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001558
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001559 /* Check for BOM marks (U+FEFF) in the input and adjust current
1560 byte order setting accordingly. In native mode, the leading BOM
1561 mark is skipped, in all other modes, it is copied to the output
1562 stream as-is (giving a ZWNBSP character). */
1563 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001564 if (size >= 2) {
1565 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001566#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001567 if (bom == 0xFEFF) {
1568 q += 2;
1569 bo = -1;
1570 }
1571 else if (bom == 0xFFFE) {
1572 q += 2;
1573 bo = 1;
1574 }
Tim Petersced69f82003-09-16 20:30:58 +00001575#else
Walter Dörwald69652032004-09-07 20:24:22 +00001576 if (bom == 0xFEFF) {
1577 q += 2;
1578 bo = 1;
1579 }
1580 else if (bom == 0xFFFE) {
1581 q += 2;
1582 bo = -1;
1583 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001584#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001585 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001587
Tim Peters772747b2001-08-09 22:21:55 +00001588 if (bo == -1) {
1589 /* force LE */
1590 ihi = 1;
1591 ilo = 0;
1592 }
1593 else if (bo == 1) {
1594 /* force BE */
1595 ihi = 0;
1596 ilo = 1;
1597 }
1598
1599 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001600 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001601 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001602 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001603 if (consumed)
1604 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001605 errmsg = "truncated data";
1606 startinpos = ((const char *)q)-starts;
1607 endinpos = ((const char *)e)-starts;
1608 goto utf16Error;
1609 /* The remaining input chars are ignored if the callback
1610 chooses to skip the input */
1611 }
1612 ch = (q[ihi] << 8) | q[ilo];
1613
Tim Peters772747b2001-08-09 22:21:55 +00001614 q += 2;
1615
Guido van Rossumd57fd912000-03-10 22:53:23 +00001616 if (ch < 0xD800 || ch > 0xDFFF) {
1617 *p++ = ch;
1618 continue;
1619 }
1620
1621 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001622 if (q >= e) {
1623 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001624 startinpos = (((const char *)q)-2)-starts;
1625 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001626 goto utf16Error;
1627 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001628 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001629 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1630 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001631 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001632#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001633 *p++ = ch;
1634 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001635#else
1636 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001637#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001638 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001639 }
1640 else {
1641 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001642 startinpos = (((const char *)q)-4)-starts;
1643 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001644 goto utf16Error;
1645 }
1646
Guido van Rossumd57fd912000-03-10 22:53:23 +00001647 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001648 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001649 startinpos = (((const char *)q)-2)-starts;
1650 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001651 /* Fall through to report the error */
1652
1653 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001654 outpos = p-PyUnicode_AS_UNICODE(unicode);
1655 if (unicode_decode_call_errorhandler(
1656 errors, &errorHandler,
1657 "utf16", errmsg,
1658 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1659 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001660 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001661 }
1662
1663 if (byteorder)
1664 *byteorder = bo;
1665
Walter Dörwald69652032004-09-07 20:24:22 +00001666 if (consumed)
1667 *consumed = (const char *)q-starts;
1668
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001670 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001671 goto onError;
1672
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001673 Py_XDECREF(errorHandler);
1674 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001675 return (PyObject *)unicode;
1676
1677onError:
1678 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001679 Py_XDECREF(errorHandler);
1680 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001681 return NULL;
1682}
1683
Tim Peters772747b2001-08-09 22:21:55 +00001684PyObject *
1685PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001686 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001687 const char *errors,
1688 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001689{
1690 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001691 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001692#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001693 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001694#else
1695 const int pairs = 0;
1696#endif
Tim Peters772747b2001-08-09 22:21:55 +00001697 /* Offsets from p for storing byte pairs in the right order. */
1698#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1699 int ihi = 1, ilo = 0;
1700#else
1701 int ihi = 0, ilo = 1;
1702#endif
1703
1704#define STORECHAR(CH) \
1705 do { \
1706 p[ihi] = ((CH) >> 8) & 0xff; \
1707 p[ilo] = (CH) & 0xff; \
1708 p += 2; \
1709 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001711#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001712 for (i = pairs = 0; i < size; i++)
1713 if (s[i] >= 0x10000)
1714 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001715#endif
Tim Petersced69f82003-09-16 20:30:58 +00001716 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001717 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718 if (v == NULL)
1719 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001720
Tim Peters772747b2001-08-09 22:21:55 +00001721 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001722 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001723 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001724 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001725 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001726
1727 if (byteorder == -1) {
1728 /* force LE */
1729 ihi = 1;
1730 ilo = 0;
1731 }
1732 else if (byteorder == 1) {
1733 /* force BE */
1734 ihi = 0;
1735 ilo = 1;
1736 }
1737
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001738 while (size-- > 0) {
1739 Py_UNICODE ch = *s++;
1740 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001741#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001742 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001743 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1744 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001746#endif
Tim Peters772747b2001-08-09 22:21:55 +00001747 STORECHAR(ch);
1748 if (ch2)
1749 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001750 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001752#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753}
1754
1755PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1756{
1757 if (!PyUnicode_Check(unicode)) {
1758 PyErr_BadArgument();
1759 return NULL;
1760 }
1761 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1762 PyUnicode_GET_SIZE(unicode),
1763 NULL,
1764 0);
1765}
1766
1767/* --- Unicode Escape Codec ----------------------------------------------- */
1768
Fredrik Lundh06d12682001-01-24 07:59:11 +00001769static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001770
Guido van Rossumd57fd912000-03-10 22:53:23 +00001771PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001772 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001773 const char *errors)
1774{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001775 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001776 Py_ssize_t startinpos;
1777 Py_ssize_t endinpos;
1778 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001779 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001781 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001783 char* message;
1784 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001785 PyObject *errorHandler = NULL;
1786 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001787
Guido van Rossumd57fd912000-03-10 22:53:23 +00001788 /* Escaped strings will always be longer than the resulting
1789 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001790 length after conversion to the true value.
1791 (but if the error callback returns a long replacement string
1792 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 v = _PyUnicode_New(size);
1794 if (v == NULL)
1795 goto onError;
1796 if (size == 0)
1797 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001798
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001799 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001800 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001801
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802 while (s < end) {
1803 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001804 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001805 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001806
1807 /* Non-escape characters are interpreted as Unicode ordinals */
1808 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001809 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810 continue;
1811 }
1812
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001813 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001814 /* \ - Escapes */
1815 s++;
1816 switch (*s++) {
1817
1818 /* \x escapes */
1819 case '\n': break;
1820 case '\\': *p++ = '\\'; break;
1821 case '\'': *p++ = '\''; break;
1822 case '\"': *p++ = '\"'; break;
1823 case 'b': *p++ = '\b'; break;
1824 case 'f': *p++ = '\014'; break; /* FF */
1825 case 't': *p++ = '\t'; break;
1826 case 'n': *p++ = '\n'; break;
1827 case 'r': *p++ = '\r'; break;
1828 case 'v': *p++ = '\013'; break; /* VT */
1829 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1830
1831 /* \OOO (octal) escapes */
1832 case '0': case '1': case '2': case '3':
1833 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001834 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001836 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001837 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001838 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001839 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001840 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001841 break;
1842
Fredrik Lundhccc74732001-02-18 22:13:49 +00001843 /* hex escapes */
1844 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001845 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001846 digits = 2;
1847 message = "truncated \\xXX escape";
1848 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849
Fredrik Lundhccc74732001-02-18 22:13:49 +00001850 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001851 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001852 digits = 4;
1853 message = "truncated \\uXXXX escape";
1854 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001855
Fredrik Lundhccc74732001-02-18 22:13:49 +00001856 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001857 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001858 digits = 8;
1859 message = "truncated \\UXXXXXXXX escape";
1860 hexescape:
1861 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001862 outpos = p-PyUnicode_AS_UNICODE(v);
1863 if (s+digits>end) {
1864 endinpos = size;
1865 if (unicode_decode_call_errorhandler(
1866 errors, &errorHandler,
1867 "unicodeescape", "end of string in escape sequence",
1868 starts, size, &startinpos, &endinpos, &exc, &s,
1869 (PyObject **)&v, &outpos, &p))
1870 goto onError;
1871 goto nextByte;
1872 }
1873 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001874 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001875 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001876 endinpos = (s+i+1)-starts;
1877 if (unicode_decode_call_errorhandler(
1878 errors, &errorHandler,
1879 "unicodeescape", message,
1880 starts, size, &startinpos, &endinpos, &exc, &s,
1881 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001882 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001883 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001884 }
1885 chr = (chr<<4) & ~0xF;
1886 if (c >= '0' && c <= '9')
1887 chr += c - '0';
1888 else if (c >= 'a' && c <= 'f')
1889 chr += 10 + c - 'a';
1890 else
1891 chr += 10 + c - 'A';
1892 }
1893 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001894 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001895 /* _decoding_error will have already written into the
1896 target buffer. */
1897 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001898 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001899 /* when we get here, chr is a 32-bit unicode character */
1900 if (chr <= 0xffff)
1901 /* UCS-2 character */
1902 *p++ = (Py_UNICODE) chr;
1903 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001904 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001905 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001906#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001907 *p++ = chr;
1908#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001909 chr -= 0x10000L;
1910 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001911 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001912#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001913 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001914 endinpos = s-starts;
1915 outpos = p-PyUnicode_AS_UNICODE(v);
1916 if (unicode_decode_call_errorhandler(
1917 errors, &errorHandler,
1918 "unicodeescape", "illegal Unicode character",
1919 starts, size, &startinpos, &endinpos, &exc, &s,
1920 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001921 goto onError;
1922 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001923 break;
1924
1925 /* \N{name} */
1926 case 'N':
1927 message = "malformed \\N character escape";
1928 if (ucnhash_CAPI == NULL) {
1929 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001930 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001931 m = PyImport_ImportModule("unicodedata");
1932 if (m == NULL)
1933 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001934 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001935 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001936 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001937 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00001938 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001939 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001940 if (ucnhash_CAPI == NULL)
1941 goto ucnhashError;
1942 }
1943 if (*s == '{') {
1944 const char *start = s+1;
1945 /* look for the closing brace */
1946 while (*s != '}' && s < end)
1947 s++;
1948 if (s > start && s < end && *s == '}') {
1949 /* found a name. look it up in the unicode database */
1950 message = "unknown Unicode character name";
1951 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001952 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001953 goto store;
1954 }
1955 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001956 endinpos = s-starts;
1957 outpos = p-PyUnicode_AS_UNICODE(v);
1958 if (unicode_decode_call_errorhandler(
1959 errors, &errorHandler,
1960 "unicodeescape", message,
1961 starts, size, &startinpos, &endinpos, &exc, &s,
1962 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001963 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001964 break;
1965
1966 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001967 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001968 message = "\\ at end of string";
1969 s--;
1970 endinpos = s-starts;
1971 outpos = p-PyUnicode_AS_UNICODE(v);
1972 if (unicode_decode_call_errorhandler(
1973 errors, &errorHandler,
1974 "unicodeescape", message,
1975 starts, size, &startinpos, &endinpos, &exc, &s,
1976 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001977 goto onError;
1978 }
1979 else {
1980 *p++ = '\\';
1981 *p++ = (unsigned char)s[-1];
1982 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001983 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001985 nextByte:
1986 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00001988 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001989 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001990 Py_XDECREF(errorHandler);
1991 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001993
Fredrik Lundhccc74732001-02-18 22:13:49 +00001994ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001995 PyErr_SetString(
1996 PyExc_UnicodeError,
1997 "\\N escapes not supported (can't load unicodedata module)"
1998 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001999 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002000 Py_XDECREF(errorHandler);
2001 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002002 return NULL;
2003
Fredrik Lundhccc74732001-02-18 22:13:49 +00002004onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002005 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002006 Py_XDECREF(errorHandler);
2007 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008 return NULL;
2009}
2010
2011/* Return a Unicode-Escape string version of the Unicode object.
2012
2013 If quotes is true, the string is enclosed in u"" or u'' quotes as
2014 appropriate.
2015
2016*/
2017
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002018Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Fredrik Lundh95e2a912006-05-26 11:38:15 +00002019 Py_ssize_t size,
2020 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002021{
2022 /* like wcschr, but doesn't stop at NULL characters */
2023
2024 while (size-- > 0) {
2025 if (*s == ch)
2026 return s;
2027 s++;
2028 }
2029
2030 return NULL;
2031}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002032
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033static
2034PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002035 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036 int quotes)
2037{
2038 PyObject *repr;
2039 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002041 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042
Neal Norwitz17753ec2006-08-21 22:21:19 +00002043 /* XXX(nnorwitz): rather than over-allocating, it would be
2044 better to choose a different scheme. Perhaps scan the
2045 first N-chars of the string and allocate based on that size.
2046 */
2047 /* Initial allocation is based on the longest-possible unichr
2048 escape.
2049
2050 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2051 unichr, so in this case it's the longest unichr escape. In
2052 narrow (UTF-16) builds this is five chars per source unichr
2053 since there are two unichrs in the surrogate pair, so in narrow
2054 (UTF-16) builds it's not the longest unichr escape.
2055
2056 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2057 so in the narrow (UTF-16) build case it's the longest unichr
2058 escape.
2059 */
2060
2061 repr = PyString_FromStringAndSize(NULL,
2062 2
2063#ifdef Py_UNICODE_WIDE
2064 + 10*size
2065#else
2066 + 6*size
2067#endif
2068 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069 if (repr == NULL)
2070 return NULL;
2071
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002072 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002073
2074 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002075 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002076 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002077 !findchar(s, size, '"')) ? '"' : '\'';
2078 }
2079 while (size-- > 0) {
2080 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002081
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002082 /* Escape quotes and backslashes */
2083 if ((quotes &&
2084 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002085 *p++ = '\\';
2086 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002087 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002088 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002089
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002090#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002091 /* Map 21-bit characters to '\U00xxxxxx' */
2092 else if (ch >= 0x10000) {
2093 *p++ = '\\';
2094 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002095 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2096 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2097 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2098 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2099 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2100 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2101 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002102 *p++ = hexdigit[ch & 0x0000000F];
2103 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002104 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002105#else
2106 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002107 else if (ch >= 0xD800 && ch < 0xDC00) {
2108 Py_UNICODE ch2;
2109 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002110
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002111 ch2 = *s++;
2112 size--;
2113 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2114 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2115 *p++ = '\\';
2116 *p++ = 'U';
2117 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2118 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2119 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2120 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2121 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2122 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2123 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2124 *p++ = hexdigit[ucs & 0x0000000F];
2125 continue;
2126 }
2127 /* Fall through: isolated surrogates are copied as-is */
2128 s--;
2129 size++;
2130 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002131#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002132
Guido van Rossumd57fd912000-03-10 22:53:23 +00002133 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002134 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002135 *p++ = '\\';
2136 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002137 *p++ = hexdigit[(ch >> 12) & 0x000F];
2138 *p++ = hexdigit[(ch >> 8) & 0x000F];
2139 *p++ = hexdigit[(ch >> 4) & 0x000F];
2140 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002141 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002142
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002143 /* Map special whitespace to '\t', \n', '\r' */
2144 else if (ch == '\t') {
2145 *p++ = '\\';
2146 *p++ = 't';
2147 }
2148 else if (ch == '\n') {
2149 *p++ = '\\';
2150 *p++ = 'n';
2151 }
2152 else if (ch == '\r') {
2153 *p++ = '\\';
2154 *p++ = 'r';
2155 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002156
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002157 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002158 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002159 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002160 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002161 *p++ = hexdigit[(ch >> 4) & 0x000F];
2162 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002163 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002164
Guido van Rossumd57fd912000-03-10 22:53:23 +00002165 /* Copy everything else as-is */
2166 else
2167 *p++ = (char) ch;
2168 }
2169 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002170 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002171
2172 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002173 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174 return repr;
2175}
2176
2177PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002178 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002179{
2180 return unicodeescape_string(s, size, 0);
2181}
2182
2183PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2184{
2185 if (!PyUnicode_Check(unicode)) {
2186 PyErr_BadArgument();
2187 return NULL;
2188 }
2189 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2190 PyUnicode_GET_SIZE(unicode));
2191}
2192
2193/* --- Raw Unicode Escape Codec ------------------------------------------- */
2194
2195PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002196 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002197 const char *errors)
2198{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002199 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002200 Py_ssize_t startinpos;
2201 Py_ssize_t endinpos;
2202 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002203 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002204 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002205 const char *end;
2206 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002207 PyObject *errorHandler = NULL;
2208 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002209
Guido van Rossumd57fd912000-03-10 22:53:23 +00002210 /* Escaped strings will always be longer than the resulting
2211 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002212 length after conversion to the true value. (But decoding error
2213 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002214 v = _PyUnicode_New(size);
2215 if (v == NULL)
2216 goto onError;
2217 if (size == 0)
2218 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002219 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002220 end = s + size;
2221 while (s < end) {
2222 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002223 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002224 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002225 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002226
2227 /* Non-escape characters are interpreted as Unicode ordinals */
2228 if (*s != '\\') {
2229 *p++ = (unsigned char)*s++;
2230 continue;
2231 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002232 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002233
2234 /* \u-escapes are only interpreted iff the number of leading
2235 backslashes if odd */
2236 bs = s;
2237 for (;s < end;) {
2238 if (*s != '\\')
2239 break;
2240 *p++ = (unsigned char)*s++;
2241 }
2242 if (((s - bs) & 1) == 0 ||
2243 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002244 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002245 continue;
2246 }
2247 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002248 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002249 s++;
2250
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002251 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002252 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002253 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002254 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002255 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002256 endinpos = s-starts;
2257 if (unicode_decode_call_errorhandler(
2258 errors, &errorHandler,
2259 "rawunicodeescape", "truncated \\uXXXX",
2260 starts, size, &startinpos, &endinpos, &exc, &s,
2261 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002262 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002263 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002264 }
2265 x = (x<<4) & ~0xF;
2266 if (c >= '0' && c <= '9')
2267 x += c - '0';
2268 else if (c >= 'a' && c <= 'f')
2269 x += 10 + c - 'a';
2270 else
2271 x += 10 + c - 'A';
2272 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002273#ifndef Py_UNICODE_WIDE
2274 if (x > 0x10000) {
2275 if (unicode_decode_call_errorhandler(
2276 errors, &errorHandler,
2277 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2278 starts, size, &startinpos, &endinpos, &exc, &s,
2279 (PyObject **)&v, &outpos, &p))
2280 goto onError;
2281 }
2282#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002283 *p++ = x;
2284 nextByte:
2285 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002286 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002287 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002288 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002289 Py_XDECREF(errorHandler);
2290 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002292
Guido van Rossumd57fd912000-03-10 22:53:23 +00002293 onError:
2294 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002295 Py_XDECREF(errorHandler);
2296 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002297 return NULL;
2298}
2299
2300PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002301 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002302{
2303 PyObject *repr;
2304 char *p;
2305 char *q;
2306
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002307 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002308
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002309#ifdef Py_UNICODE_WIDE
2310 repr = PyString_FromStringAndSize(NULL, 10 * size);
2311#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002312 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002313#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002314 if (repr == NULL)
2315 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002316 if (size == 0)
2317 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002318
2319 p = q = PyString_AS_STRING(repr);
2320 while (size-- > 0) {
2321 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002322#ifdef Py_UNICODE_WIDE
2323 /* Map 32-bit characters to '\Uxxxxxxxx' */
2324 if (ch >= 0x10000) {
2325 *p++ = '\\';
2326 *p++ = 'U';
2327 *p++ = hexdigit[(ch >> 28) & 0xf];
2328 *p++ = hexdigit[(ch >> 24) & 0xf];
2329 *p++ = hexdigit[(ch >> 20) & 0xf];
2330 *p++ = hexdigit[(ch >> 16) & 0xf];
2331 *p++ = hexdigit[(ch >> 12) & 0xf];
2332 *p++ = hexdigit[(ch >> 8) & 0xf];
2333 *p++ = hexdigit[(ch >> 4) & 0xf];
2334 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002335 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002336 else
2337#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002338 /* Map 16-bit characters to '\uxxxx' */
2339 if (ch >= 256) {
2340 *p++ = '\\';
2341 *p++ = 'u';
2342 *p++ = hexdigit[(ch >> 12) & 0xf];
2343 *p++ = hexdigit[(ch >> 8) & 0xf];
2344 *p++ = hexdigit[(ch >> 4) & 0xf];
2345 *p++ = hexdigit[ch & 15];
2346 }
2347 /* Copy everything else as-is */
2348 else
2349 *p++ = (char) ch;
2350 }
2351 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002352 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002353 return repr;
2354}
2355
2356PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2357{
2358 if (!PyUnicode_Check(unicode)) {
2359 PyErr_BadArgument();
2360 return NULL;
2361 }
2362 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2363 PyUnicode_GET_SIZE(unicode));
2364}
2365
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002366/* --- Unicode Internal Codec ------------------------------------------- */
2367
2368PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002369 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002370 const char *errors)
2371{
2372 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002373 Py_ssize_t startinpos;
2374 Py_ssize_t endinpos;
2375 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002376 PyUnicodeObject *v;
2377 Py_UNICODE *p;
2378 const char *end;
2379 const char *reason;
2380 PyObject *errorHandler = NULL;
2381 PyObject *exc = NULL;
2382
Neal Norwitzd43069c2006-01-08 01:12:10 +00002383#ifdef Py_UNICODE_WIDE
2384 Py_UNICODE unimax = PyUnicode_GetMax();
2385#endif
2386
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002387 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2388 if (v == NULL)
2389 goto onError;
2390 if (PyUnicode_GetSize((PyObject *)v) == 0)
2391 return (PyObject *)v;
2392 p = PyUnicode_AS_UNICODE(v);
2393 end = s + size;
2394
2395 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00002396 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002397 /* We have to sanity check the raw data, otherwise doom looms for
2398 some malformed UCS-4 data. */
2399 if (
2400 #ifdef Py_UNICODE_WIDE
2401 *p > unimax || *p < 0 ||
2402 #endif
2403 end-s < Py_UNICODE_SIZE
2404 )
2405 {
2406 startinpos = s - starts;
2407 if (end-s < Py_UNICODE_SIZE) {
2408 endinpos = end-starts;
2409 reason = "truncated input";
2410 }
2411 else {
2412 endinpos = s - starts + Py_UNICODE_SIZE;
2413 reason = "illegal code point (> 0x10FFFF)";
2414 }
2415 outpos = p - PyUnicode_AS_UNICODE(v);
2416 if (unicode_decode_call_errorhandler(
2417 errors, &errorHandler,
2418 "unicode_internal", reason,
2419 starts, size, &startinpos, &endinpos, &exc, &s,
2420 (PyObject **)&v, &outpos, &p)) {
2421 goto onError;
2422 }
2423 }
2424 else {
2425 p++;
2426 s += Py_UNICODE_SIZE;
2427 }
2428 }
2429
Martin v. Löwis412fb672006-04-13 06:34:32 +00002430 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002431 goto onError;
2432 Py_XDECREF(errorHandler);
2433 Py_XDECREF(exc);
2434 return (PyObject *)v;
2435
2436 onError:
2437 Py_XDECREF(v);
2438 Py_XDECREF(errorHandler);
2439 Py_XDECREF(exc);
2440 return NULL;
2441}
2442
Guido van Rossumd57fd912000-03-10 22:53:23 +00002443/* --- Latin-1 Codec ------------------------------------------------------ */
2444
2445PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002446 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002447 const char *errors)
2448{
2449 PyUnicodeObject *v;
2450 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002451
Guido van Rossumd57fd912000-03-10 22:53:23 +00002452 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002453 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002454 Py_UNICODE r = *(unsigned char*)s;
2455 return PyUnicode_FromUnicode(&r, 1);
2456 }
2457
Guido van Rossumd57fd912000-03-10 22:53:23 +00002458 v = _PyUnicode_New(size);
2459 if (v == NULL)
2460 goto onError;
2461 if (size == 0)
2462 return (PyObject *)v;
2463 p = PyUnicode_AS_UNICODE(v);
2464 while (size-- > 0)
2465 *p++ = (unsigned char)*s++;
2466 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002467
Guido van Rossumd57fd912000-03-10 22:53:23 +00002468 onError:
2469 Py_XDECREF(v);
2470 return NULL;
2471}
2472
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002473/* create or adjust a UnicodeEncodeError */
2474static void make_encode_exception(PyObject **exceptionObject,
2475 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002476 const Py_UNICODE *unicode, Py_ssize_t size,
2477 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002478 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002479{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002480 if (*exceptionObject == NULL) {
2481 *exceptionObject = PyUnicodeEncodeError_Create(
2482 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002483 }
2484 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002485 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2486 goto onError;
2487 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2488 goto onError;
2489 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2490 goto onError;
2491 return;
2492 onError:
2493 Py_DECREF(*exceptionObject);
2494 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002495 }
2496}
2497
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002498/* raises a UnicodeEncodeError */
2499static void raise_encode_exception(PyObject **exceptionObject,
2500 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002501 const Py_UNICODE *unicode, Py_ssize_t size,
2502 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002503 const char *reason)
2504{
2505 make_encode_exception(exceptionObject,
2506 encoding, unicode, size, startpos, endpos, reason);
2507 if (*exceptionObject != NULL)
2508 PyCodec_StrictErrors(*exceptionObject);
2509}
2510
2511/* error handling callback helper:
2512 build arguments, call the callback and check the arguments,
2513 put the result into newpos and return the replacement string, which
2514 has to be freed by the caller */
2515static PyObject *unicode_encode_call_errorhandler(const char *errors,
2516 PyObject **errorHandler,
2517 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002518 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2519 Py_ssize_t startpos, Py_ssize_t endpos,
2520 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002521{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002522 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002523
2524 PyObject *restuple;
2525 PyObject *resunicode;
2526
2527 if (*errorHandler == NULL) {
2528 *errorHandler = PyCodec_LookupError(errors);
2529 if (*errorHandler == NULL)
2530 return NULL;
2531 }
2532
2533 make_encode_exception(exceptionObject,
2534 encoding, unicode, size, startpos, endpos, reason);
2535 if (*exceptionObject == NULL)
2536 return NULL;
2537
2538 restuple = PyObject_CallFunctionObjArgs(
2539 *errorHandler, *exceptionObject, NULL);
2540 if (restuple == NULL)
2541 return NULL;
2542 if (!PyTuple_Check(restuple)) {
2543 PyErr_Format(PyExc_TypeError, &argparse[4]);
2544 Py_DECREF(restuple);
2545 return NULL;
2546 }
2547 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2548 &resunicode, newpos)) {
2549 Py_DECREF(restuple);
2550 return NULL;
2551 }
2552 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002553 *newpos = size+*newpos;
2554 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002555 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002556 Py_DECREF(restuple);
2557 return NULL;
2558 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002559 Py_INCREF(resunicode);
2560 Py_DECREF(restuple);
2561 return resunicode;
2562}
2563
2564static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002565 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002566 const char *errors,
2567 int limit)
2568{
2569 /* output object */
2570 PyObject *res;
2571 /* pointers to the beginning and end+1 of input */
2572 const Py_UNICODE *startp = p;
2573 const Py_UNICODE *endp = p + size;
2574 /* pointer to the beginning of the unencodable characters */
2575 /* const Py_UNICODE *badp = NULL; */
2576 /* pointer into the output */
2577 char *str;
2578 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002579 Py_ssize_t respos = 0;
2580 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00002581 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2582 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002583 PyObject *errorHandler = NULL;
2584 PyObject *exc = NULL;
2585 /* the following variable is used for caching string comparisons
2586 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2587 int known_errorHandler = -1;
2588
2589 /* allocate enough for a simple encoding without
2590 replacements, if we need more, we'll resize */
2591 res = PyString_FromStringAndSize(NULL, size);
2592 if (res == NULL)
2593 goto onError;
2594 if (size == 0)
2595 return res;
2596 str = PyString_AS_STRING(res);
2597 ressize = size;
2598
2599 while (p<endp) {
2600 Py_UNICODE c = *p;
2601
2602 /* can we encode this? */
2603 if (c<limit) {
2604 /* no overflow check, because we know that the space is enough */
2605 *str++ = (char)c;
2606 ++p;
2607 }
2608 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002609 Py_ssize_t unicodepos = p-startp;
2610 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002611 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002612 Py_ssize_t repsize;
2613 Py_ssize_t newpos;
2614 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002615 Py_UNICODE *uni2;
2616 /* startpos for collecting unencodable chars */
2617 const Py_UNICODE *collstart = p;
2618 const Py_UNICODE *collend = p;
2619 /* find all unecodable characters */
2620 while ((collend < endp) && ((*collend)>=limit))
2621 ++collend;
2622 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2623 if (known_errorHandler==-1) {
2624 if ((errors==NULL) || (!strcmp(errors, "strict")))
2625 known_errorHandler = 1;
2626 else if (!strcmp(errors, "replace"))
2627 known_errorHandler = 2;
2628 else if (!strcmp(errors, "ignore"))
2629 known_errorHandler = 3;
2630 else if (!strcmp(errors, "xmlcharrefreplace"))
2631 known_errorHandler = 4;
2632 else
2633 known_errorHandler = 0;
2634 }
2635 switch (known_errorHandler) {
2636 case 1: /* strict */
2637 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2638 goto onError;
2639 case 2: /* replace */
2640 while (collstart++<collend)
2641 *str++ = '?'; /* fall through */
2642 case 3: /* ignore */
2643 p = collend;
2644 break;
2645 case 4: /* xmlcharrefreplace */
2646 respos = str-PyString_AS_STRING(res);
2647 /* determine replacement size (temporarily (mis)uses p) */
2648 for (p = collstart, repsize = 0; p < collend; ++p) {
2649 if (*p<10)
2650 repsize += 2+1+1;
2651 else if (*p<100)
2652 repsize += 2+2+1;
2653 else if (*p<1000)
2654 repsize += 2+3+1;
2655 else if (*p<10000)
2656 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002657#ifndef Py_UNICODE_WIDE
2658 else
2659 repsize += 2+5+1;
2660#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002661 else if (*p<100000)
2662 repsize += 2+5+1;
2663 else if (*p<1000000)
2664 repsize += 2+6+1;
2665 else
2666 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002667#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002668 }
2669 requiredsize = respos+repsize+(endp-collend);
2670 if (requiredsize > ressize) {
2671 if (requiredsize<2*ressize)
2672 requiredsize = 2*ressize;
2673 if (_PyString_Resize(&res, requiredsize))
2674 goto onError;
2675 str = PyString_AS_STRING(res) + respos;
2676 ressize = requiredsize;
2677 }
2678 /* generate replacement (temporarily (mis)uses p) */
2679 for (p = collstart; p < collend; ++p) {
2680 str += sprintf(str, "&#%d;", (int)*p);
2681 }
2682 p = collend;
2683 break;
2684 default:
2685 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2686 encoding, reason, startp, size, &exc,
2687 collstart-startp, collend-startp, &newpos);
2688 if (repunicode == NULL)
2689 goto onError;
2690 /* need more space? (at least enough for what we
2691 have+the replacement+the rest of the string, so
2692 we won't have to check space for encodable characters) */
2693 respos = str-PyString_AS_STRING(res);
2694 repsize = PyUnicode_GET_SIZE(repunicode);
2695 requiredsize = respos+repsize+(endp-collend);
2696 if (requiredsize > ressize) {
2697 if (requiredsize<2*ressize)
2698 requiredsize = 2*ressize;
2699 if (_PyString_Resize(&res, requiredsize)) {
2700 Py_DECREF(repunicode);
2701 goto onError;
2702 }
2703 str = PyString_AS_STRING(res) + respos;
2704 ressize = requiredsize;
2705 }
2706 /* check if there is anything unencodable in the replacement
2707 and copy it to the output */
2708 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2709 c = *uni2;
2710 if (c >= limit) {
2711 raise_encode_exception(&exc, encoding, startp, size,
2712 unicodepos, unicodepos+1, reason);
2713 Py_DECREF(repunicode);
2714 goto onError;
2715 }
2716 *str = (char)c;
2717 }
2718 p = startp + newpos;
2719 Py_DECREF(repunicode);
2720 }
2721 }
2722 }
2723 /* Resize if we allocated to much */
2724 respos = str-PyString_AS_STRING(res);
2725 if (respos<ressize)
2726 /* If this falls res will be NULL */
2727 _PyString_Resize(&res, respos);
2728 Py_XDECREF(errorHandler);
2729 Py_XDECREF(exc);
2730 return res;
2731
2732 onError:
2733 Py_XDECREF(res);
2734 Py_XDECREF(errorHandler);
2735 Py_XDECREF(exc);
2736 return NULL;
2737}
2738
Guido van Rossumd57fd912000-03-10 22:53:23 +00002739PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002740 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741 const char *errors)
2742{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002743 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002744}
2745
2746PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2747{
2748 if (!PyUnicode_Check(unicode)) {
2749 PyErr_BadArgument();
2750 return NULL;
2751 }
2752 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2753 PyUnicode_GET_SIZE(unicode),
2754 NULL);
2755}
2756
2757/* --- 7-bit ASCII Codec -------------------------------------------------- */
2758
Guido van Rossumd57fd912000-03-10 22:53:23 +00002759PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002760 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761 const char *errors)
2762{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002763 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764 PyUnicodeObject *v;
2765 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002766 Py_ssize_t startinpos;
2767 Py_ssize_t endinpos;
2768 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002769 const char *e;
2770 PyObject *errorHandler = NULL;
2771 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002772
Guido van Rossumd57fd912000-03-10 22:53:23 +00002773 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002774 if (size == 1 && *(unsigned char*)s < 128) {
2775 Py_UNICODE r = *(unsigned char*)s;
2776 return PyUnicode_FromUnicode(&r, 1);
2777 }
Tim Petersced69f82003-09-16 20:30:58 +00002778
Guido van Rossumd57fd912000-03-10 22:53:23 +00002779 v = _PyUnicode_New(size);
2780 if (v == NULL)
2781 goto onError;
2782 if (size == 0)
2783 return (PyObject *)v;
2784 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002785 e = s + size;
2786 while (s < e) {
2787 register unsigned char c = (unsigned char)*s;
2788 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002789 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002790 ++s;
2791 }
2792 else {
2793 startinpos = s-starts;
2794 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002795 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002796 if (unicode_decode_call_errorhandler(
2797 errors, &errorHandler,
2798 "ascii", "ordinal not in range(128)",
2799 starts, size, &startinpos, &endinpos, &exc, &s,
2800 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002802 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002804 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00002805 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002806 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002807 Py_XDECREF(errorHandler);
2808 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002809 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002810
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811 onError:
2812 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002813 Py_XDECREF(errorHandler);
2814 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002815 return NULL;
2816}
2817
Guido van Rossumd57fd912000-03-10 22:53:23 +00002818PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002819 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002820 const char *errors)
2821{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002822 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002823}
2824
2825PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2826{
2827 if (!PyUnicode_Check(unicode)) {
2828 PyErr_BadArgument();
2829 return NULL;
2830 }
2831 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2832 PyUnicode_GET_SIZE(unicode),
2833 NULL);
2834}
2835
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002836#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002837
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002838/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002839
Martin v. Löwisd8251432006-06-14 05:21:04 +00002840#if SIZEOF_INT < SIZEOF_SSIZE_T
2841#define NEED_RETRY
2842#endif
2843
2844/* XXX This code is limited to "true" double-byte encodings, as
2845 a) it assumes an incomplete character consists of a single byte, and
2846 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2847 encodings, see IsDBCSLeadByteEx documentation. */
2848
/* Return 1 if the byte at s[offset] is reported by IsDBCSLeadByte() and
   its position relative to the character found by CharPrev() is
   consistent with it starting a character; return 0 otherwise. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
2859
2860/*
2861 * Decode MBCS string into unicode object. If 'final' is set, converts
2862 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
2863 */
2864static int decode_mbcs(PyUnicodeObject **v,
2865 const char *s, /* MBCS string */
2866 int size, /* sizeof MBCS string */
2867 int final)
2868{
2869 Py_UNICODE *p;
2870 Py_ssize_t n = 0;
2871 int usize = 0;
2872
2873 assert(size >= 0);
2874
2875 /* Skip trailing lead-byte unless 'final' is set */
2876 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
2877 --size;
2878
2879 /* First get the size of the result */
2880 if (size > 0) {
2881 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2882 if (usize == 0) {
2883 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2884 return -1;
2885 }
2886 }
2887
2888 if (*v == NULL) {
2889 /* Create unicode object */
2890 *v = _PyUnicode_New(usize);
2891 if (*v == NULL)
2892 return -1;
2893 }
2894 else {
2895 /* Extend unicode object */
2896 n = PyUnicode_GET_SIZE(*v);
2897 if (_PyUnicode_Resize(v, n + usize) < 0)
2898 return -1;
2899 }
2900
2901 /* Do the conversion */
2902 if (size > 0) {
2903 p = PyUnicode_AS_UNICODE(*v) + n;
2904 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2905 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2906 return -1;
2907 }
2908 }
2909
2910 return size;
2911}
2912
2913PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
2914 Py_ssize_t size,
2915 const char *errors,
2916 Py_ssize_t *consumed)
2917{
2918 PyUnicodeObject *v = NULL;
2919 int done;
2920
2921 if (consumed)
2922 *consumed = 0;
2923
2924#ifdef NEED_RETRY
2925 retry:
2926 if (size > INT_MAX)
2927 done = decode_mbcs(&v, s, INT_MAX, 0);
2928 else
2929#endif
2930 done = decode_mbcs(&v, s, (int)size, !consumed);
2931
2932 if (done < 0) {
2933 Py_XDECREF(v);
2934 return NULL;
2935 }
2936
2937 if (consumed)
2938 *consumed += done;
2939
2940#ifdef NEED_RETRY
2941 if (size > INT_MAX) {
2942 s += done;
2943 size -= done;
2944 goto retry;
2945 }
2946#endif
2947
2948 return (PyObject *)v;
2949}
2950
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002951PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002952 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002953 const char *errors)
2954{
Martin v. Löwisd8251432006-06-14 05:21:04 +00002955 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
2956}
2957
2958/*
2959 * Convert unicode into string object (MBCS).
2960 * Returns 0 if succeed, -1 otherwise.
2961 */
2962static int encode_mbcs(PyObject **repr,
2963 const Py_UNICODE *p, /* unicode */
2964 int size) /* size of unicode */
2965{
2966 int mbcssize = 0;
2967 Py_ssize_t n = 0;
2968
2969 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002970
2971 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00002972 if (size > 0) {
2973 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2974 if (mbcssize == 0) {
2975 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2976 return -1;
2977 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002978 }
2979
Martin v. Löwisd8251432006-06-14 05:21:04 +00002980 if (*repr == NULL) {
2981 /* Create string object */
2982 *repr = PyString_FromStringAndSize(NULL, mbcssize);
2983 if (*repr == NULL)
2984 return -1;
2985 }
2986 else {
2987 /* Extend string object */
2988 n = PyString_Size(*repr);
2989 if (_PyString_Resize(repr, n + mbcssize) < 0)
2990 return -1;
2991 }
2992
2993 /* Do the conversion */
2994 if (size > 0) {
2995 char *s = PyString_AS_STRING(*repr) + n;
2996 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2997 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2998 return -1;
2999 }
3000 }
3001
3002 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003003}
3004
3005PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003006 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003007 const char *errors)
3008{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003009 PyObject *repr = NULL;
3010 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003011
Martin v. Löwisd8251432006-06-14 05:21:04 +00003012#ifdef NEED_RETRY
3013 retry:
3014 if (size > INT_MAX)
3015 ret = encode_mbcs(&repr, p, INT_MAX);
3016 else
3017#endif
3018 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003019
Martin v. Löwisd8251432006-06-14 05:21:04 +00003020 if (ret < 0) {
3021 Py_XDECREF(repr);
3022 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003023 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003024
3025#ifdef NEED_RETRY
3026 if (size > INT_MAX) {
3027 p += INT_MAX;
3028 size -= INT_MAX;
3029 goto retry;
3030 }
3031#endif
3032
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003033 return repr;
3034}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003035
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003036PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3037{
3038 if (!PyUnicode_Check(unicode)) {
3039 PyErr_BadArgument();
3040 return NULL;
3041 }
3042 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3043 PyUnicode_GET_SIZE(unicode),
3044 NULL);
3045}
3046
Martin v. Löwisd8251432006-06-14 05:21:04 +00003047#undef NEED_RETRY
3048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003049#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003050
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051/* --- Character Mapping Codec -------------------------------------------- */
3052
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003054 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003055 PyObject *mapping,
3056 const char *errors)
3057{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003058 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003059 Py_ssize_t startinpos;
3060 Py_ssize_t endinpos;
3061 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003062 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003063 PyUnicodeObject *v;
3064 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003065 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003066 PyObject *errorHandler = NULL;
3067 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003068 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003069 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003070
Guido van Rossumd57fd912000-03-10 22:53:23 +00003071 /* Default to Latin-1 */
3072 if (mapping == NULL)
3073 return PyUnicode_DecodeLatin1(s, size, errors);
3074
3075 v = _PyUnicode_New(size);
3076 if (v == NULL)
3077 goto onError;
3078 if (size == 0)
3079 return (PyObject *)v;
3080 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003081 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003082 if (PyUnicode_CheckExact(mapping)) {
3083 mapstring = PyUnicode_AS_UNICODE(mapping);
3084 maplen = PyUnicode_GET_SIZE(mapping);
3085 while (s < e) {
3086 unsigned char ch = *s;
3087 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003088
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003089 if (ch < maplen)
3090 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003092 if (x == 0xfffe) {
3093 /* undefined mapping */
3094 outpos = p-PyUnicode_AS_UNICODE(v);
3095 startinpos = s-starts;
3096 endinpos = startinpos+1;
3097 if (unicode_decode_call_errorhandler(
3098 errors, &errorHandler,
3099 "charmap", "character maps to <undefined>",
3100 starts, size, &startinpos, &endinpos, &exc, &s,
3101 (PyObject **)&v, &outpos, &p)) {
3102 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003103 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003104 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003105 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003106 *p++ = x;
3107 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003108 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003109 }
3110 else {
3111 while (s < e) {
3112 unsigned char ch = *s;
3113 PyObject *w, *x;
3114
3115 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3116 w = PyInt_FromLong((long)ch);
3117 if (w == NULL)
3118 goto onError;
3119 x = PyObject_GetItem(mapping, w);
3120 Py_DECREF(w);
3121 if (x == NULL) {
3122 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3123 /* No mapping found means: mapping is undefined. */
3124 PyErr_Clear();
3125 x = Py_None;
3126 Py_INCREF(x);
3127 } else
3128 goto onError;
3129 }
3130
3131 /* Apply mapping */
3132 if (PyInt_Check(x)) {
3133 long value = PyInt_AS_LONG(x);
3134 if (value < 0 || value > 65535) {
3135 PyErr_SetString(PyExc_TypeError,
3136 "character mapping must be in range(65536)");
3137 Py_DECREF(x);
3138 goto onError;
3139 }
3140 *p++ = (Py_UNICODE)value;
3141 }
3142 else if (x == Py_None) {
3143 /* undefined mapping */
3144 outpos = p-PyUnicode_AS_UNICODE(v);
3145 startinpos = s-starts;
3146 endinpos = startinpos+1;
3147 if (unicode_decode_call_errorhandler(
3148 errors, &errorHandler,
3149 "charmap", "character maps to <undefined>",
3150 starts, size, &startinpos, &endinpos, &exc, &s,
3151 (PyObject **)&v, &outpos, &p)) {
3152 Py_DECREF(x);
3153 goto onError;
3154 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003155 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003156 continue;
3157 }
3158 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003159 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003160
3161 if (targetsize == 1)
3162 /* 1-1 mapping */
3163 *p++ = *PyUnicode_AS_UNICODE(x);
3164
3165 else if (targetsize > 1) {
3166 /* 1-n mapping */
3167 if (targetsize > extrachars) {
3168 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003169 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3170 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003171 (targetsize << 2);
3172 extrachars += needed;
3173 if (_PyUnicode_Resize(&v,
3174 PyUnicode_GET_SIZE(v) + needed) < 0) {
3175 Py_DECREF(x);
3176 goto onError;
3177 }
3178 p = PyUnicode_AS_UNICODE(v) + oldpos;
3179 }
3180 Py_UNICODE_COPY(p,
3181 PyUnicode_AS_UNICODE(x),
3182 targetsize);
3183 p += targetsize;
3184 extrachars -= targetsize;
3185 }
3186 /* 1-0 mapping: skip the character */
3187 }
3188 else {
3189 /* wrong return value */
3190 PyErr_SetString(PyExc_TypeError,
3191 "character mapping must return integer, None or unicode");
3192 Py_DECREF(x);
3193 goto onError;
3194 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003195 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003196 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003197 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003198 }
3199 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003200 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003201 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003202 Py_XDECREF(errorHandler);
3203 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003204 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003205
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003207 Py_XDECREF(errorHandler);
3208 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209 Py_XDECREF(v);
3210 return NULL;
3211}
3212
Martin v. Löwis3f767792006-06-04 19:36:28 +00003213/* Charmap encoding: the lookup table */
3214
3215struct encoding_map{
3216 PyObject_HEAD
3217 unsigned char level1[32];
3218 int count2, count3;
3219 unsigned char level23[1];
3220};
3221
3222static PyObject*
3223encoding_map_size(PyObject *obj, PyObject* args)
3224{
3225 struct encoding_map *map = (struct encoding_map*)obj;
3226 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3227 128*map->count3);
3228}
3229
3230static PyMethodDef encoding_map_methods[] = {
3231 {"size", encoding_map_size, METH_NOARGS,
3232 PyDoc_STR("Return the size (in bytes) of this object") },
3233 { 0 }
3234};
3235
3236static void
3237encoding_map_dealloc(PyObject* o)
3238{
3239 PyObject_FREE(o);
3240}
3241
3242static PyTypeObject EncodingMapType = {
3243 PyObject_HEAD_INIT(NULL)
3244 0, /*ob_size*/
3245 "EncodingMap", /*tp_name*/
3246 sizeof(struct encoding_map), /*tp_basicsize*/
3247 0, /*tp_itemsize*/
3248 /* methods */
3249 encoding_map_dealloc, /*tp_dealloc*/
3250 0, /*tp_print*/
3251 0, /*tp_getattr*/
3252 0, /*tp_setattr*/
3253 0, /*tp_compare*/
3254 0, /*tp_repr*/
3255 0, /*tp_as_number*/
3256 0, /*tp_as_sequence*/
3257 0, /*tp_as_mapping*/
3258 0, /*tp_hash*/
3259 0, /*tp_call*/
3260 0, /*tp_str*/
3261 0, /*tp_getattro*/
3262 0, /*tp_setattro*/
3263 0, /*tp_as_buffer*/
3264 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3265 0, /*tp_doc*/
3266 0, /*tp_traverse*/
3267 0, /*tp_clear*/
3268 0, /*tp_richcompare*/
3269 0, /*tp_weaklistoffset*/
3270 0, /*tp_iter*/
3271 0, /*tp_iternext*/
3272 encoding_map_methods, /*tp_methods*/
3273 0, /*tp_members*/
3274 0, /*tp_getset*/
3275 0, /*tp_base*/
3276 0, /*tp_dict*/
3277 0, /*tp_descr_get*/
3278 0, /*tp_descr_set*/
3279 0, /*tp_dictoffset*/
3280 0, /*tp_init*/
3281 0, /*tp_alloc*/
3282 0, /*tp_new*/
3283 0, /*tp_free*/
3284 0, /*tp_is_gc*/
3285};
3286
3287PyObject*
3288PyUnicode_BuildEncodingMap(PyObject* string)
3289{
3290 Py_UNICODE *decode;
3291 PyObject *result;
3292 struct encoding_map *mresult;
3293 int i;
3294 int need_dict = 0;
3295 unsigned char level1[32];
3296 unsigned char level2[512];
3297 unsigned char *mlevel1, *mlevel2, *mlevel3;
3298 int count2 = 0, count3 = 0;
3299
3300 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3301 PyErr_BadArgument();
3302 return NULL;
3303 }
3304 decode = PyUnicode_AS_UNICODE(string);
3305 memset(level1, 0xFF, sizeof level1);
3306 memset(level2, 0xFF, sizeof level2);
3307
3308 /* If there isn't a one-to-one mapping of NULL to \0,
3309 or if there are non-BMP characters, we need to use
3310 a mapping dictionary. */
3311 if (decode[0] != 0)
3312 need_dict = 1;
3313 for (i = 1; i < 256; i++) {
3314 int l1, l2;
3315 if (decode[i] == 0
3316 #ifdef Py_UNICODE_WIDE
3317 || decode[i] > 0xFFFF
3318 #endif
3319 ) {
3320 need_dict = 1;
3321 break;
3322 }
3323 if (decode[i] == 0xFFFE)
3324 /* unmapped character */
3325 continue;
3326 l1 = decode[i] >> 11;
3327 l2 = decode[i] >> 7;
3328 if (level1[l1] == 0xFF)
3329 level1[l1] = count2++;
3330 if (level2[l2] == 0xFF)
3331 level2[l2] = count3++;
3332 }
3333
3334 if (count2 >= 0xFF || count3 >= 0xFF)
3335 need_dict = 1;
3336
3337 if (need_dict) {
3338 PyObject *result = PyDict_New();
3339 PyObject *key, *value;
3340 if (!result)
3341 return NULL;
3342 for (i = 0; i < 256; i++) {
3343 key = value = NULL;
3344 key = PyInt_FromLong(decode[i]);
3345 value = PyInt_FromLong(i);
3346 if (!key || !value)
3347 goto failed1;
3348 if (PyDict_SetItem(result, key, value) == -1)
3349 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00003350 Py_DECREF(key);
3351 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003352 }
3353 return result;
3354 failed1:
3355 Py_XDECREF(key);
3356 Py_XDECREF(value);
3357 Py_DECREF(result);
3358 return NULL;
3359 }
3360
3361 /* Create a three-level trie */
3362 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3363 16*count2 + 128*count3 - 1);
3364 if (!result)
3365 return PyErr_NoMemory();
3366 PyObject_Init(result, &EncodingMapType);
3367 mresult = (struct encoding_map*)result;
3368 mresult->count2 = count2;
3369 mresult->count3 = count3;
3370 mlevel1 = mresult->level1;
3371 mlevel2 = mresult->level23;
3372 mlevel3 = mresult->level23 + 16*count2;
3373 memcpy(mlevel1, level1, 32);
3374 memset(mlevel2, 0xFF, 16*count2);
3375 memset(mlevel3, 0, 128*count3);
3376 count3 = 0;
3377 for (i = 1; i < 256; i++) {
3378 int o1, o2, o3, i2, i3;
3379 if (decode[i] == 0xFFFE)
3380 /* unmapped character */
3381 continue;
3382 o1 = decode[i]>>11;
3383 o2 = (decode[i]>>7) & 0xF;
3384 i2 = 16*mlevel1[o1] + o2;
3385 if (mlevel2[i2] == 0xFF)
3386 mlevel2[i2] = count3++;
3387 o3 = decode[i] & 0x7F;
3388 i3 = 128*mlevel2[i2] + o3;
3389 mlevel3[i3] = i;
3390 }
3391 return result;
3392}
3393
3394static int
3395encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3396{
3397 struct encoding_map *map = (struct encoding_map*)mapping;
3398 int l1 = c>>11;
3399 int l2 = (c>>7) & 0xF;
3400 int l3 = c & 0x7F;
3401 int i;
3402
3403#ifdef Py_UNICODE_WIDE
3404 if (c > 0xFFFF) {
3405 return -1;
3406 }
3407#endif
3408 if (c == 0)
3409 return 0;
3410 /* level 1*/
3411 i = map->level1[l1];
3412 if (i == 0xFF) {
3413 return -1;
3414 }
3415 /* level 2*/
3416 i = map->level23[16*i+l2];
3417 if (i == 0xFF) {
3418 return -1;
3419 }
3420 /* level 3 */
3421 i = map->level23[16*map->count2 + 128*i + l3];
3422 if (i == 0) {
3423 return -1;
3424 }
3425 return i;
3426}
3427
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003428/* Lookup the character ch in the mapping. If the character
3429 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003430 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003431static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003432{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003433 PyObject *w = PyInt_FromLong((long)c);
3434 PyObject *x;
3435
3436 if (w == NULL)
3437 return NULL;
3438 x = PyObject_GetItem(mapping, w);
3439 Py_DECREF(w);
3440 if (x == NULL) {
3441 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3442 /* No mapping found means: mapping is undefined. */
3443 PyErr_Clear();
3444 x = Py_None;
3445 Py_INCREF(x);
3446 return x;
3447 } else
3448 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003449 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003450 else if (x == Py_None)
3451 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003452 else if (PyInt_Check(x)) {
3453 long value = PyInt_AS_LONG(x);
3454 if (value < 0 || value > 255) {
3455 PyErr_SetString(PyExc_TypeError,
3456 "character mapping must be in range(256)");
3457 Py_DECREF(x);
3458 return NULL;
3459 }
3460 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003461 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003462 else if (PyString_Check(x))
3463 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003464 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003465 /* wrong return value */
3466 PyErr_SetString(PyExc_TypeError,
3467 "character mapping must return integer, None or str");
3468 Py_DECREF(x);
3469 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003470 }
3471}
3472
Martin v. Löwis3f767792006-06-04 19:36:28 +00003473static int
3474charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3475{
3476 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3477 /* exponentially overallocate to minimize reallocations */
3478 if (requiredsize < 2*outsize)
3479 requiredsize = 2*outsize;
3480 if (_PyString_Resize(outobj, requiredsize)) {
3481 return 0;
3482 }
3483 return 1;
3484}
3485
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* a lookup or resize error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003489/* lookup the character, put the result in the output string and adjust
3490 various state variables. Reallocate the output string if not enough
3491 space is available. Return a new reference to the object that
3492 was put in the output buffer, or Py_None, if the mapping was undefined
3493 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003494 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003495static
Martin v. Löwis3f767792006-06-04 19:36:28 +00003496charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003497 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003498{
Martin v. Löwis3f767792006-06-04 19:36:28 +00003499 PyObject *rep;
3500 char *outstart;
3501 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003502
Martin v. Löwis3f767792006-06-04 19:36:28 +00003503 if (mapping->ob_type == &EncodingMapType) {
3504 int res = encoding_map_lookup(c, mapping);
3505 Py_ssize_t requiredsize = *outpos+1;
3506 if (res == -1)
3507 return enc_FAILED;
3508 if (outsize<requiredsize)
3509 if (!charmapencode_resize(outobj, outpos, requiredsize))
3510 return enc_EXCEPTION;
3511 outstart = PyString_AS_STRING(*outobj);
3512 outstart[(*outpos)++] = (char)res;
3513 return enc_SUCCESS;
3514 }
3515
3516 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003517 if (rep==NULL)
Martin v. Löwis3f767792006-06-04 19:36:28 +00003518 return enc_EXCEPTION;
3519 else if (rep==Py_None) {
3520 Py_DECREF(rep);
3521 return enc_FAILED;
3522 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003523 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003524 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003525 if (outsize<requiredsize)
3526 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003527 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003528 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003529 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003530 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003531 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3532 }
3533 else {
3534 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003535 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3536 Py_ssize_t requiredsize = *outpos+repsize;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003537 if (outsize<requiredsize)
3538 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003539 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003540 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003541 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003542 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003543 memcpy(outstart + *outpos, repchars, repsize);
3544 *outpos += repsize;
3545 }
3546 }
Georg Brandl9f167602006-06-04 21:46:16 +00003547 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003548 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003549}
3550
3551/* handle an error in PyUnicode_EncodeCharmap
3552 Return 0 on success, -1 on error */
3553static
3554int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003555 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003556 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003557 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003558 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003559{
3560 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003561 Py_ssize_t repsize;
3562 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003563 Py_UNICODE *uni2;
3564 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003565 Py_ssize_t collstartpos = *inpos;
3566 Py_ssize_t collendpos = *inpos+1;
3567 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003568 char *encoding = "charmap";
3569 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00003570 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003571
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003572 /* find all unencodable characters */
3573 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00003574 PyObject *rep;
3575 if (mapping->ob_type == &EncodingMapType) {
3576 int res = encoding_map_lookup(p[collendpos], mapping);
3577 if (res != -1)
3578 break;
3579 ++collendpos;
3580 continue;
3581 }
3582
3583 rep = charmapencode_lookup(p[collendpos], mapping);
3584 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003585 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003586 else if (rep!=Py_None) {
3587 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003588 break;
3589 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003590 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003591 ++collendpos;
3592 }
3593 /* cache callback name lookup
3594 * (if not done yet, i.e. it's the first error) */
3595 if (*known_errorHandler==-1) {
3596 if ((errors==NULL) || (!strcmp(errors, "strict")))
3597 *known_errorHandler = 1;
3598 else if (!strcmp(errors, "replace"))
3599 *known_errorHandler = 2;
3600 else if (!strcmp(errors, "ignore"))
3601 *known_errorHandler = 3;
3602 else if (!strcmp(errors, "xmlcharrefreplace"))
3603 *known_errorHandler = 4;
3604 else
3605 *known_errorHandler = 0;
3606 }
3607 switch (*known_errorHandler) {
3608 case 1: /* strict */
3609 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3610 return -1;
3611 case 2: /* replace */
3612 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3613 x = charmapencode_output('?', mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003614 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003615 return -1;
3616 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003617 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003618 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3619 return -1;
3620 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003621 }
3622 /* fall through */
3623 case 3: /* ignore */
3624 *inpos = collendpos;
3625 break;
3626 case 4: /* xmlcharrefreplace */
3627 /* generate replacement (temporarily (mis)uses p) */
3628 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3629 char buffer[2+29+1+1];
3630 char *cp;
3631 sprintf(buffer, "&#%d;", (int)p[collpos]);
3632 for (cp = buffer; *cp; ++cp) {
3633 x = charmapencode_output(*cp, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003634 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003635 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003636 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003637 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3638 return -1;
3639 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003640 }
3641 }
3642 *inpos = collendpos;
3643 break;
3644 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003645 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003646 encoding, reason, p, size, exceptionObject,
3647 collstartpos, collendpos, &newpos);
3648 if (repunicode == NULL)
3649 return -1;
3650 /* generate replacement */
3651 repsize = PyUnicode_GET_SIZE(repunicode);
3652 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3653 x = charmapencode_output(*uni2, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003654 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003655 return -1;
3656 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003657 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003658 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003659 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3660 return -1;
3661 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003662 }
3663 *inpos = newpos;
3664 Py_DECREF(repunicode);
3665 }
3666 return 0;
3667}
3668
Guido van Rossumd57fd912000-03-10 22:53:23 +00003669PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003670 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003671 PyObject *mapping,
3672 const char *errors)
3673{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003674 /* output object */
3675 PyObject *res = NULL;
3676 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003677 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003678 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003679 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003680 PyObject *errorHandler = NULL;
3681 PyObject *exc = NULL;
3682 /* the following variable is used for caching string comparisons
3683 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3684 * 3=ignore, 4=xmlcharrefreplace */
3685 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003686
3687 /* Default to Latin-1 */
3688 if (mapping == NULL)
3689 return PyUnicode_EncodeLatin1(p, size, errors);
3690
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003691 /* allocate enough for a simple encoding without
3692 replacements, if we need more, we'll resize */
3693 res = PyString_FromStringAndSize(NULL, size);
3694 if (res == NULL)
3695 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003696 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003697 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003698
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003699 while (inpos<size) {
3700 /* try to encode it */
Martin v. Löwis3f767792006-06-04 19:36:28 +00003701 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3702 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003703 goto onError;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003704 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003705 if (charmap_encoding_error(p, size, &inpos, mapping,
3706 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003707 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003708 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003709 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003710 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003711 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003712 else
3713 /* done with this character => adjust input position */
3714 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003715 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003716
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003717 /* Resize if we allocated to much */
3718 if (respos<PyString_GET_SIZE(res)) {
3719 if (_PyString_Resize(&res, respos))
3720 goto onError;
3721 }
3722 Py_XDECREF(exc);
3723 Py_XDECREF(errorHandler);
3724 return res;
3725
3726 onError:
3727 Py_XDECREF(res);
3728 Py_XDECREF(exc);
3729 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003730 return NULL;
3731}
3732
3733PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3734 PyObject *mapping)
3735{
3736 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3737 PyErr_BadArgument();
3738 return NULL;
3739 }
3740 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3741 PyUnicode_GET_SIZE(unicode),
3742 mapping,
3743 NULL);
3744}
3745
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003746/* create or adjust a UnicodeTranslateError */
3747static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003748 const Py_UNICODE *unicode, Py_ssize_t size,
3749 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003750 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003751{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003752 if (*exceptionObject == NULL) {
3753 *exceptionObject = PyUnicodeTranslateError_Create(
3754 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003755 }
3756 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003757 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3758 goto onError;
3759 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3760 goto onError;
3761 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3762 goto onError;
3763 return;
3764 onError:
3765 Py_DECREF(*exceptionObject);
3766 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003767 }
3768}
3769
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003770/* raises a UnicodeTranslateError */
3771static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003772 const Py_UNICODE *unicode, Py_ssize_t size,
3773 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003774 const char *reason)
3775{
3776 make_translate_exception(exceptionObject,
3777 unicode, size, startpos, endpos, reason);
3778 if (*exceptionObject != NULL)
3779 PyCodec_StrictErrors(*exceptionObject);
3780}
3781
3782/* error handling callback helper:
3783 build arguments, call the callback and check the arguments,
3784 put the result into newpos and return the replacement string, which
3785 has to be freed by the caller */
3786static PyObject *unicode_translate_call_errorhandler(const char *errors,
3787 PyObject **errorHandler,
3788 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003789 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3790 Py_ssize_t startpos, Py_ssize_t endpos,
3791 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003792{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003793 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003794
Martin v. Löwis412fb672006-04-13 06:34:32 +00003795 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003796 PyObject *restuple;
3797 PyObject *resunicode;
3798
3799 if (*errorHandler == NULL) {
3800 *errorHandler = PyCodec_LookupError(errors);
3801 if (*errorHandler == NULL)
3802 return NULL;
3803 }
3804
3805 make_translate_exception(exceptionObject,
3806 unicode, size, startpos, endpos, reason);
3807 if (*exceptionObject == NULL)
3808 return NULL;
3809
3810 restuple = PyObject_CallFunctionObjArgs(
3811 *errorHandler, *exceptionObject, NULL);
3812 if (restuple == NULL)
3813 return NULL;
3814 if (!PyTuple_Check(restuple)) {
3815 PyErr_Format(PyExc_TypeError, &argparse[4]);
3816 Py_DECREF(restuple);
3817 return NULL;
3818 }
3819 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003820 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003821 Py_DECREF(restuple);
3822 return NULL;
3823 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003824 if (i_newpos<0)
3825 *newpos = size+i_newpos;
3826 else
3827 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003828 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003829 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003830 Py_DECREF(restuple);
3831 return NULL;
3832 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003833 Py_INCREF(resunicode);
3834 Py_DECREF(restuple);
3835 return resunicode;
3836}
3837
3838/* Lookup the character ch in the mapping and put the result in result,
3839 which must be decrefed by the caller.
3840 Return 0 on success, -1 on error */
3841static
3842int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3843{
3844 PyObject *w = PyInt_FromLong((long)c);
3845 PyObject *x;
3846
3847 if (w == NULL)
3848 return -1;
3849 x = PyObject_GetItem(mapping, w);
3850 Py_DECREF(w);
3851 if (x == NULL) {
3852 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3853 /* No mapping found means: use 1:1 mapping. */
3854 PyErr_Clear();
3855 *result = NULL;
3856 return 0;
3857 } else
3858 return -1;
3859 }
3860 else if (x == Py_None) {
3861 *result = x;
3862 return 0;
3863 }
3864 else if (PyInt_Check(x)) {
3865 long value = PyInt_AS_LONG(x);
3866 long max = PyUnicode_GetMax();
3867 if (value < 0 || value > max) {
3868 PyErr_Format(PyExc_TypeError,
3869 "character mapping must be in range(0x%lx)", max+1);
3870 Py_DECREF(x);
3871 return -1;
3872 }
3873 *result = x;
3874 return 0;
3875 }
3876 else if (PyUnicode_Check(x)) {
3877 *result = x;
3878 return 0;
3879 }
3880 else {
3881 /* wrong return value */
3882 PyErr_SetString(PyExc_TypeError,
3883 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003884 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003885 return -1;
3886 }
3887}
3888/* ensure that *outobj is at least requiredsize characters long,
3889if not reallocate and adjust various state variables.
3890Return 0 on success, -1 on error */
3891static
Walter Dörwald4894c302003-10-24 14:25:28 +00003892int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003893 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003894{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003895 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003896 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003897 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003898 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003899 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003900 if (requiredsize < 2 * oldsize)
3901 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003902 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003903 return -1;
3904 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003905 }
3906 return 0;
3907}
3908/* lookup the character, put the result in the output string and adjust
3909 various state variables. Return a new reference to the object that
3910 was put in the output buffer in *result, or Py_None, if the mapping was
3911 undefined (in which case no character was written).
3912 The called must decref result.
3913 Return 0 on success, -1 on error. */
3914static
Walter Dörwald4894c302003-10-24 14:25:28 +00003915int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003916 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003917 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003918{
Walter Dörwald4894c302003-10-24 14:25:28 +00003919 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003920 return -1;
3921 if (*res==NULL) {
3922 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003923 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003924 }
3925 else if (*res==Py_None)
3926 ;
3927 else if (PyInt_Check(*res)) {
3928 /* no overflow check, because we know that the space is enough */
3929 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3930 }
3931 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003932 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003933 if (repsize==1) {
3934 /* no overflow check, because we know that the space is enough */
3935 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3936 }
3937 else if (repsize!=0) {
3938 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003939 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003940 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003941 repsize - 1;
3942 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003943 return -1;
3944 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3945 *outp += repsize;
3946 }
3947 }
3948 else
3949 return -1;
3950 return 0;
3951}
3952
3953PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003954 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003955 PyObject *mapping,
3956 const char *errors)
3957{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003958 /* output object */
3959 PyObject *res = NULL;
3960 /* pointers to the beginning and end+1 of input */
3961 const Py_UNICODE *startp = p;
3962 const Py_UNICODE *endp = p + size;
3963 /* pointer into the output */
3964 Py_UNICODE *str;
3965 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003966 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003967 char *reason = "character maps to <undefined>";
3968 PyObject *errorHandler = NULL;
3969 PyObject *exc = NULL;
3970 /* the following variable is used for caching string comparisons
3971 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3972 * 3=ignore, 4=xmlcharrefreplace */
3973 int known_errorHandler = -1;
3974
Guido van Rossumd57fd912000-03-10 22:53:23 +00003975 if (mapping == NULL) {
3976 PyErr_BadArgument();
3977 return NULL;
3978 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003979
3980 /* allocate enough for a simple 1:1 translation without
3981 replacements, if we need more, we'll resize */
3982 res = PyUnicode_FromUnicode(NULL, size);
3983 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003984 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003985 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003986 return res;
3987 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003988
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003989 while (p<endp) {
3990 /* try to encode it */
3991 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003992 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003993 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003994 goto onError;
3995 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003996 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003997 if (x!=Py_None) /* it worked => adjust input pointer */
3998 ++p;
3999 else { /* untranslatable character */
4000 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004001 Py_ssize_t repsize;
4002 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004003 Py_UNICODE *uni2;
4004 /* startpos for collecting untranslatable chars */
4005 const Py_UNICODE *collstart = p;
4006 const Py_UNICODE *collend = p+1;
4007 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004008
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004009 /* find all untranslatable characters */
4010 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004011 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004012 goto onError;
4013 Py_XDECREF(x);
4014 if (x!=Py_None)
4015 break;
4016 ++collend;
4017 }
4018 /* cache callback name lookup
4019 * (if not done yet, i.e. it's the first error) */
4020 if (known_errorHandler==-1) {
4021 if ((errors==NULL) || (!strcmp(errors, "strict")))
4022 known_errorHandler = 1;
4023 else if (!strcmp(errors, "replace"))
4024 known_errorHandler = 2;
4025 else if (!strcmp(errors, "ignore"))
4026 known_errorHandler = 3;
4027 else if (!strcmp(errors, "xmlcharrefreplace"))
4028 known_errorHandler = 4;
4029 else
4030 known_errorHandler = 0;
4031 }
4032 switch (known_errorHandler) {
4033 case 1: /* strict */
4034 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4035 goto onError;
4036 case 2: /* replace */
4037 /* No need to check for space, this is a 1:1 replacement */
4038 for (coll = collstart; coll<collend; ++coll)
4039 *str++ = '?';
4040 /* fall through */
4041 case 3: /* ignore */
4042 p = collend;
4043 break;
4044 case 4: /* xmlcharrefreplace */
4045 /* generate replacement (temporarily (mis)uses p) */
4046 for (p = collstart; p < collend; ++p) {
4047 char buffer[2+29+1+1];
4048 char *cp;
4049 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004050 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004051 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4052 goto onError;
4053 for (cp = buffer; *cp; ++cp)
4054 *str++ = *cp;
4055 }
4056 p = collend;
4057 break;
4058 default:
4059 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4060 reason, startp, size, &exc,
4061 collstart-startp, collend-startp, &newpos);
4062 if (repunicode == NULL)
4063 goto onError;
4064 /* generate replacement */
4065 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004066 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004067 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4068 Py_DECREF(repunicode);
4069 goto onError;
4070 }
4071 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4072 *str++ = *uni2;
4073 p = startp + newpos;
4074 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004075 }
4076 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004078 /* Resize if we allocated to much */
4079 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004080 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004081 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004082 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004083 }
4084 Py_XDECREF(exc);
4085 Py_XDECREF(errorHandler);
4086 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004087
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004088 onError:
4089 Py_XDECREF(res);
4090 Py_XDECREF(exc);
4091 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004092 return NULL;
4093}
4094
4095PyObject *PyUnicode_Translate(PyObject *str,
4096 PyObject *mapping,
4097 const char *errors)
4098{
4099 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004100
Guido van Rossumd57fd912000-03-10 22:53:23 +00004101 str = PyUnicode_FromObject(str);
4102 if (str == NULL)
4103 goto onError;
4104 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4105 PyUnicode_GET_SIZE(str),
4106 mapping,
4107 errors);
4108 Py_DECREF(str);
4109 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004110
Guido van Rossumd57fd912000-03-10 22:53:23 +00004111 onError:
4112 Py_XDECREF(str);
4113 return NULL;
4114}
Tim Petersced69f82003-09-16 20:30:58 +00004115
Guido van Rossum9e896b32000-04-05 20:11:21 +00004116/* --- Decimal Encoder ---------------------------------------------------- */
4117
4118int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004119 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004120 char *output,
4121 const char *errors)
4122{
4123 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004124 PyObject *errorHandler = NULL;
4125 PyObject *exc = NULL;
4126 const char *encoding = "decimal";
4127 const char *reason = "invalid decimal Unicode string";
4128 /* the following variable is used for caching string comparisons
4129 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4130 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004131
4132 if (output == NULL) {
4133 PyErr_BadArgument();
4134 return -1;
4135 }
4136
4137 p = s;
4138 end = s + length;
4139 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004140 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004141 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004142 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004143 Py_ssize_t repsize;
4144 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004145 Py_UNICODE *uni2;
4146 Py_UNICODE *collstart;
4147 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004148
Guido van Rossum9e896b32000-04-05 20:11:21 +00004149 if (Py_UNICODE_ISSPACE(ch)) {
4150 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004151 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004152 continue;
4153 }
4154 decimal = Py_UNICODE_TODECIMAL(ch);
4155 if (decimal >= 0) {
4156 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004157 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004158 continue;
4159 }
Guido van Rossumba477042000-04-06 18:18:10 +00004160 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004161 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004162 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004163 continue;
4164 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004165 /* All other characters are considered unencodable */
4166 collstart = p;
4167 collend = p+1;
4168 while (collend < end) {
4169 if ((0 < *collend && *collend < 256) ||
4170 !Py_UNICODE_ISSPACE(*collend) ||
4171 Py_UNICODE_TODECIMAL(*collend))
4172 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004173 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004174 /* cache callback name lookup
4175 * (if not done yet, i.e. it's the first error) */
4176 if (known_errorHandler==-1) {
4177 if ((errors==NULL) || (!strcmp(errors, "strict")))
4178 known_errorHandler = 1;
4179 else if (!strcmp(errors, "replace"))
4180 known_errorHandler = 2;
4181 else if (!strcmp(errors, "ignore"))
4182 known_errorHandler = 3;
4183 else if (!strcmp(errors, "xmlcharrefreplace"))
4184 known_errorHandler = 4;
4185 else
4186 known_errorHandler = 0;
4187 }
4188 switch (known_errorHandler) {
4189 case 1: /* strict */
4190 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4191 goto onError;
4192 case 2: /* replace */
4193 for (p = collstart; p < collend; ++p)
4194 *output++ = '?';
4195 /* fall through */
4196 case 3: /* ignore */
4197 p = collend;
4198 break;
4199 case 4: /* xmlcharrefreplace */
4200 /* generate replacement (temporarily (mis)uses p) */
4201 for (p = collstart; p < collend; ++p)
4202 output += sprintf(output, "&#%d;", (int)*p);
4203 p = collend;
4204 break;
4205 default:
4206 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4207 encoding, reason, s, length, &exc,
4208 collstart-s, collend-s, &newpos);
4209 if (repunicode == NULL)
4210 goto onError;
4211 /* generate replacement */
4212 repsize = PyUnicode_GET_SIZE(repunicode);
4213 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4214 Py_UNICODE ch = *uni2;
4215 if (Py_UNICODE_ISSPACE(ch))
4216 *output++ = ' ';
4217 else {
4218 decimal = Py_UNICODE_TODECIMAL(ch);
4219 if (decimal >= 0)
4220 *output++ = '0' + decimal;
4221 else if (0 < ch && ch < 256)
4222 *output++ = (char)ch;
4223 else {
4224 Py_DECREF(repunicode);
4225 raise_encode_exception(&exc, encoding,
4226 s, length, collstart-s, collend-s, reason);
4227 goto onError;
4228 }
4229 }
4230 }
4231 p = s + newpos;
4232 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004233 }
4234 }
4235 /* 0-terminate the output string */
4236 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004237 Py_XDECREF(exc);
4238 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004239 return 0;
4240
4241 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004242 Py_XDECREF(exc);
4243 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004244 return -1;
4245}
4246
Guido van Rossumd57fd912000-03-10 22:53:23 +00004247/* --- Helpers ------------------------------------------------------------ */
4248
Fredrik Lundha50d2012006-05-26 17:04:58 +00004249#define STRINGLIB_CHAR Py_UNICODE
Fredrik Lundh6471ee42006-05-24 14:28:11 +00004250
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004251#define STRINGLIB_LEN PyUnicode_GET_SIZE
Fredrik Lundhb9479482006-05-26 17:22:38 +00004252#define STRINGLIB_NEW PyUnicode_FromUnicode
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004253#define STRINGLIB_STR PyUnicode_AS_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00004254
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00004255Py_LOCAL_INLINE(int)
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004256STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
4257{
Fredrik Lundh9c0e9c02006-05-26 18:24:15 +00004258 if (str[0] != other[0])
4259 return 1;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004260 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
4261}
4262
Fredrik Lundhb9479482006-05-26 17:22:38 +00004263#define STRINGLIB_EMPTY unicode_empty
4264
Fredrik Lundha50d2012006-05-26 17:04:58 +00004265#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004266
4267#include "stringlib/count.h"
4268#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00004269#include "stringlib/partition.h"
4270
/* Helper macro that normalizes the slice bounds held in the local
   variables `start` and `end` against (obj)->length:
     - a negative start is taken relative to the end, then clamped to 0
     - an end beyond the length is clamped to the length
     - a negative end is taken relative to the end, then clamped to 0
   `start` is not clamped against the length.
   Expands to several statements, so use it only as a full statement. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    else if (end < 0) {                         \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
4283
Martin v. Löwis18e16552006-02-15 17:27:45 +00004284Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004285 PyObject *substr,
4286 Py_ssize_t start,
4287 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004288{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004289 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004290 PyUnicodeObject* str_obj;
4291 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004292
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004293 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4294 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004295 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004296 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4297 if (!sub_obj) {
4298 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004299 return -1;
4300 }
Tim Petersced69f82003-09-16 20:30:58 +00004301
Fredrik Lundhc8162812006-05-26 19:33:03 +00004302 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004303
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004304 result = stringlib_count(
4305 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4306 );
4307
4308 Py_DECREF(sub_obj);
4309 Py_DECREF(str_obj);
4310
Guido van Rossumd57fd912000-03-10 22:53:23 +00004311 return result;
4312}
4313
Martin v. Löwis18e16552006-02-15 17:27:45 +00004314Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004315 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004316 Py_ssize_t start,
4317 Py_ssize_t end,
4318 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004319{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004320 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004321
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004322 str = PyUnicode_FromObject(str);
4323 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004324 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004325 sub = PyUnicode_FromObject(sub);
4326 if (!sub) {
4327 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004328 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004329 }
Tim Petersced69f82003-09-16 20:30:58 +00004330
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004331 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004332 result = stringlib_find_slice(
4333 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4334 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4335 start, end
4336 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004337 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004338 result = stringlib_rfind_slice(
4339 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4340 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4341 start, end
4342 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004343
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004344 Py_DECREF(str);
4345 Py_DECREF(sub);
4346
Guido van Rossumd57fd912000-03-10 22:53:23 +00004347 return result;
4348}
4349
Tim Petersced69f82003-09-16 20:30:58 +00004350static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004351int tailmatch(PyUnicodeObject *self,
4352 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004353 Py_ssize_t start,
4354 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004355 int direction)
4356{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004357 if (substring->length == 0)
4358 return 1;
4359
Fredrik Lundhc8162812006-05-26 19:33:03 +00004360 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004361
4362 end -= substring->length;
4363 if (end < start)
4364 return 0;
4365
4366 if (direction > 0) {
4367 if (Py_UNICODE_MATCH(self, end, substring))
4368 return 1;
4369 } else {
4370 if (Py_UNICODE_MATCH(self, start, substring))
4371 return 1;
4372 }
4373
4374 return 0;
4375}
4376
Martin v. Löwis18e16552006-02-15 17:27:45 +00004377Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004378 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004379 Py_ssize_t start,
4380 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004381 int direction)
4382{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004383 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004384
Guido van Rossumd57fd912000-03-10 22:53:23 +00004385 str = PyUnicode_FromObject(str);
4386 if (str == NULL)
4387 return -1;
4388 substr = PyUnicode_FromObject(substr);
4389 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004390 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004391 return -1;
4392 }
Tim Petersced69f82003-09-16 20:30:58 +00004393
Guido van Rossumd57fd912000-03-10 22:53:23 +00004394 result = tailmatch((PyUnicodeObject *)str,
4395 (PyUnicodeObject *)substr,
4396 start, end, direction);
4397 Py_DECREF(str);
4398 Py_DECREF(substr);
4399 return result;
4400}
4401
Guido van Rossumd57fd912000-03-10 22:53:23 +00004402/* Apply fixfct filter to the Unicode object self and return a
4403 reference to the modified object */
4404
Tim Petersced69f82003-09-16 20:30:58 +00004405static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004406PyObject *fixup(PyUnicodeObject *self,
4407 int (*fixfct)(PyUnicodeObject *s))
4408{
4409
4410 PyUnicodeObject *u;
4411
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004412 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004413 if (u == NULL)
4414 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004415
4416 Py_UNICODE_COPY(u->str, self->str, self->length);
4417
Tim Peters7a29bd52001-09-12 03:03:31 +00004418 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004419 /* fixfct should return TRUE if it modified the buffer. If
4420 FALSE, return a reference to the original buffer instead
4421 (to save space, not time) */
4422 Py_INCREF(self);
4423 Py_DECREF(u);
4424 return (PyObject*) self;
4425 }
4426 return (PyObject*) u;
4427}
4428
Tim Petersced69f82003-09-16 20:30:58 +00004429static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004430int fixupper(PyUnicodeObject *self)
4431{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004432 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004433 Py_UNICODE *s = self->str;
4434 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004435
Guido van Rossumd57fd912000-03-10 22:53:23 +00004436 while (len-- > 0) {
4437 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004438
Guido van Rossumd57fd912000-03-10 22:53:23 +00004439 ch = Py_UNICODE_TOUPPER(*s);
4440 if (ch != *s) {
4441 status = 1;
4442 *s = ch;
4443 }
4444 s++;
4445 }
4446
4447 return status;
4448}
4449
Tim Petersced69f82003-09-16 20:30:58 +00004450static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004451int fixlower(PyUnicodeObject *self)
4452{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004453 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004454 Py_UNICODE *s = self->str;
4455 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004456
Guido van Rossumd57fd912000-03-10 22:53:23 +00004457 while (len-- > 0) {
4458 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004459
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460 ch = Py_UNICODE_TOLOWER(*s);
4461 if (ch != *s) {
4462 status = 1;
4463 *s = ch;
4464 }
4465 s++;
4466 }
4467
4468 return status;
4469}
4470
Tim Petersced69f82003-09-16 20:30:58 +00004471static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472int fixswapcase(PyUnicodeObject *self)
4473{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004474 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004475 Py_UNICODE *s = self->str;
4476 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004477
Guido van Rossumd57fd912000-03-10 22:53:23 +00004478 while (len-- > 0) {
4479 if (Py_UNICODE_ISUPPER(*s)) {
4480 *s = Py_UNICODE_TOLOWER(*s);
4481 status = 1;
4482 } else if (Py_UNICODE_ISLOWER(*s)) {
4483 *s = Py_UNICODE_TOUPPER(*s);
4484 status = 1;
4485 }
4486 s++;
4487 }
4488
4489 return status;
4490}
4491
Tim Petersced69f82003-09-16 20:30:58 +00004492static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004493int fixcapitalize(PyUnicodeObject *self)
4494{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004495 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004496 Py_UNICODE *s = self->str;
4497 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004498
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004499 if (len == 0)
4500 return 0;
4501 if (Py_UNICODE_ISLOWER(*s)) {
4502 *s = Py_UNICODE_TOUPPER(*s);
4503 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004504 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004505 s++;
4506 while (--len > 0) {
4507 if (Py_UNICODE_ISUPPER(*s)) {
4508 *s = Py_UNICODE_TOLOWER(*s);
4509 status = 1;
4510 }
4511 s++;
4512 }
4513 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004514}
4515
4516static
4517int fixtitle(PyUnicodeObject *self)
4518{
4519 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4520 register Py_UNICODE *e;
4521 int previous_is_cased;
4522
4523 /* Shortcut for single character strings */
4524 if (PyUnicode_GET_SIZE(self) == 1) {
4525 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4526 if (*p != ch) {
4527 *p = ch;
4528 return 1;
4529 }
4530 else
4531 return 0;
4532 }
Tim Petersced69f82003-09-16 20:30:58 +00004533
Guido van Rossumd57fd912000-03-10 22:53:23 +00004534 e = p + PyUnicode_GET_SIZE(self);
4535 previous_is_cased = 0;
4536 for (; p < e; p++) {
4537 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004538
Guido van Rossumd57fd912000-03-10 22:53:23 +00004539 if (previous_is_cased)
4540 *p = Py_UNICODE_TOLOWER(ch);
4541 else
4542 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004543
4544 if (Py_UNICODE_ISLOWER(ch) ||
4545 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004546 Py_UNICODE_ISTITLE(ch))
4547 previous_is_cased = 1;
4548 else
4549 previous_is_cased = 0;
4550 }
4551 return 1;
4552}
4553
Tim Peters8ce9f162004-08-27 01:49:32 +00004554PyObject *
4555PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004556{
Tim Peters8ce9f162004-08-27 01:49:32 +00004557 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004558 const Py_UNICODE blank = ' ';
4559 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00004560 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004561 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00004562 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4563 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004564 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4565 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004566 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004567 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004568 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004569
Tim Peters05eba1f2004-08-27 21:32:02 +00004570 fseq = PySequence_Fast(seq, "");
4571 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004572 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004573 }
4574
Tim Peters91879ab2004-08-27 22:35:44 +00004575 /* Grrrr. A codec may be invoked to convert str objects to
4576 * Unicode, and so it's possible to call back into Python code
4577 * during PyUnicode_FromObject(), and so it's possible for a sick
4578 * codec to change the size of fseq (if seq is a list). Therefore
4579 * we have to keep refetching the size -- can't assume seqlen
4580 * is invariant.
4581 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004582 seqlen = PySequence_Fast_GET_SIZE(fseq);
4583 /* If empty sequence, return u"". */
4584 if (seqlen == 0) {
4585 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4586 goto Done;
4587 }
4588 /* If singleton sequence with an exact Unicode, return that. */
4589 if (seqlen == 1) {
4590 item = PySequence_Fast_GET_ITEM(fseq, 0);
4591 if (PyUnicode_CheckExact(item)) {
4592 Py_INCREF(item);
4593 res = (PyUnicodeObject *)item;
4594 goto Done;
4595 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004596 }
4597
Tim Peters05eba1f2004-08-27 21:32:02 +00004598 /* At least two items to join, or one that isn't exact Unicode. */
4599 if (seqlen > 1) {
4600 /* Set up sep and seplen -- they're needed. */
4601 if (separator == NULL) {
4602 sep = &blank;
4603 seplen = 1;
4604 }
4605 else {
4606 internal_separator = PyUnicode_FromObject(separator);
4607 if (internal_separator == NULL)
4608 goto onError;
4609 sep = PyUnicode_AS_UNICODE(internal_separator);
4610 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004611 /* In case PyUnicode_FromObject() mutated seq. */
4612 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004613 }
4614 }
4615
4616 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004617 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004618 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004619 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004620 res_p = PyUnicode_AS_UNICODE(res);
4621 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004622
Tim Peters05eba1f2004-08-27 21:32:02 +00004623 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00004624 Py_ssize_t itemlen;
4625 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004626
4627 item = PySequence_Fast_GET_ITEM(fseq, i);
4628 /* Convert item to Unicode. */
4629 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4630 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00004631 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004632 " %.80s found",
4633 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004634 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004635 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004636 item = PyUnicode_FromObject(item);
4637 if (item == NULL)
4638 goto onError;
4639 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004640
Tim Peters91879ab2004-08-27 22:35:44 +00004641 /* In case PyUnicode_FromObject() mutated seq. */
4642 seqlen = PySequence_Fast_GET_SIZE(fseq);
4643
Tim Peters8ce9f162004-08-27 01:49:32 +00004644 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004645 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004646 new_res_used = res_used + itemlen;
Georg Brandl90e27d32006-06-10 06:40:50 +00004647 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004648 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004649 if (i < seqlen - 1) {
4650 new_res_used += seplen;
Georg Brandl90e27d32006-06-10 06:40:50 +00004651 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004652 goto Overflow;
4653 }
4654 if (new_res_used > res_alloc) {
4655 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004656 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004657 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00004658 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004659 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004660 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00004661 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004662 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004663 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004664 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004665 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004666 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004667
4668 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004669 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004670 res_p += itemlen;
4671 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004672 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004673 res_p += seplen;
4674 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004675 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004676 res_used = new_res_used;
4677 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004678
Tim Peters05eba1f2004-08-27 21:32:02 +00004679 /* Shrink res to match the used area; this probably can't fail,
4680 * but it's cheap to check.
4681 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004682 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004683 goto onError;
4684
4685 Done:
4686 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004687 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004688 return (PyObject *)res;
4689
Tim Peters8ce9f162004-08-27 01:49:32 +00004690 Overflow:
4691 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00004692 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00004693 Py_DECREF(item);
4694 /* fall through */
4695
Guido van Rossumd57fd912000-03-10 22:53:23 +00004696 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004697 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004698 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004699 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004700 return NULL;
4701}
4702
Tim Petersced69f82003-09-16 20:30:58 +00004703static
4704PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004705 Py_ssize_t left,
4706 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004707 Py_UNICODE fill)
4708{
4709 PyUnicodeObject *u;
4710
4711 if (left < 0)
4712 left = 0;
4713 if (right < 0)
4714 right = 0;
4715
Tim Peters7a29bd52001-09-12 03:03:31 +00004716 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004717 Py_INCREF(self);
4718 return self;
4719 }
4720
4721 u = _PyUnicode_New(left + self->length + right);
4722 if (u) {
4723 if (left)
4724 Py_UNICODE_FILL(u->str, fill, left);
4725 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4726 if (right)
4727 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4728 }
4729
4730 return u;
4731}
4732
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The expansion site must provide a `PyObject *str` scratch variable, a
   `PyObject *list`, and an `onError` label; control jumps to `onError` if
   either creating the slice or appending it fails.  The temporary
   reference held in `str` is always released.

   The body is wrapped in do { } while (0) so that an invocation followed
   by `;` is exactly one statement and is safe under an unbraced if/else. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4743
4744static
4745PyObject *split_whitespace(PyUnicodeObject *self,
4746 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004747 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004749 register Py_ssize_t i;
4750 register Py_ssize_t j;
4751 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004752 PyObject *str;
4753
4754 for (i = j = 0; i < len; ) {
4755 /* find a token */
4756 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4757 i++;
4758 j = i;
4759 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4760 i++;
4761 if (j < i) {
4762 if (maxcount-- <= 0)
4763 break;
4764 SPLIT_APPEND(self->str, j, i);
4765 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4766 i++;
4767 j = i;
4768 }
4769 }
4770 if (j < len) {
4771 SPLIT_APPEND(self->str, j, len);
4772 }
4773 return list;
4774
4775 onError:
4776 Py_DECREF(list);
4777 return NULL;
4778}
4779
4780PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004781 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004782{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004783 register Py_ssize_t i;
4784 register Py_ssize_t j;
4785 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004786 PyObject *list;
4787 PyObject *str;
4788 Py_UNICODE *data;
4789
4790 string = PyUnicode_FromObject(string);
4791 if (string == NULL)
4792 return NULL;
4793 data = PyUnicode_AS_UNICODE(string);
4794 len = PyUnicode_GET_SIZE(string);
4795
Guido van Rossumd57fd912000-03-10 22:53:23 +00004796 list = PyList_New(0);
4797 if (!list)
4798 goto onError;
4799
4800 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004801 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004802
Guido van Rossumd57fd912000-03-10 22:53:23 +00004803 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004804 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004805 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004806
4807 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004808 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004809 if (i < len) {
4810 if (data[i] == '\r' && i + 1 < len &&
4811 data[i+1] == '\n')
4812 i += 2;
4813 else
4814 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004815 if (keepends)
4816 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817 }
Guido van Rossum86662912000-04-11 15:38:46 +00004818 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004819 j = i;
4820 }
4821 if (j < len) {
4822 SPLIT_APPEND(data, j, len);
4823 }
4824
4825 Py_DECREF(string);
4826 return list;
4827
4828 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004829 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004830 Py_DECREF(string);
4831 return NULL;
4832}
4833
Tim Petersced69f82003-09-16 20:30:58 +00004834static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004835PyObject *split_char(PyUnicodeObject *self,
4836 PyObject *list,
4837 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004838 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004839{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004840 register Py_ssize_t i;
4841 register Py_ssize_t j;
4842 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843 PyObject *str;
4844
4845 for (i = j = 0; i < len; ) {
4846 if (self->str[i] == ch) {
4847 if (maxcount-- <= 0)
4848 break;
4849 SPLIT_APPEND(self->str, j, i);
4850 i = j = i + 1;
4851 } else
4852 i++;
4853 }
4854 if (j <= len) {
4855 SPLIT_APPEND(self->str, j, len);
4856 }
4857 return list;
4858
4859 onError:
4860 Py_DECREF(list);
4861 return NULL;
4862}
4863
Tim Petersced69f82003-09-16 20:30:58 +00004864static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865PyObject *split_substring(PyUnicodeObject *self,
4866 PyObject *list,
4867 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004868 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004869{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004870 register Py_ssize_t i;
4871 register Py_ssize_t j;
4872 Py_ssize_t len = self->length;
4873 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004874 PyObject *str;
4875
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004876 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877 if (Py_UNICODE_MATCH(self, i, substring)) {
4878 if (maxcount-- <= 0)
4879 break;
4880 SPLIT_APPEND(self->str, j, i);
4881 i = j = i + sublen;
4882 } else
4883 i++;
4884 }
4885 if (j <= len) {
4886 SPLIT_APPEND(self->str, j, len);
4887 }
4888 return list;
4889
4890 onError:
4891 Py_DECREF(list);
4892 return NULL;
4893}
4894
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004895static
4896PyObject *rsplit_whitespace(PyUnicodeObject *self,
4897 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004898 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004899{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004900 register Py_ssize_t i;
4901 register Py_ssize_t j;
4902 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004903 PyObject *str;
4904
4905 for (i = j = len - 1; i >= 0; ) {
4906 /* find a token */
4907 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4908 i--;
4909 j = i;
4910 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4911 i--;
4912 if (j > i) {
4913 if (maxcount-- <= 0)
4914 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004915 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004916 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4917 i--;
4918 j = i;
4919 }
4920 }
4921 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004922 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004923 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004924 if (PyList_Reverse(list) < 0)
4925 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004926 return list;
4927
4928 onError:
4929 Py_DECREF(list);
4930 return NULL;
4931}
4932
4933static
4934PyObject *rsplit_char(PyUnicodeObject *self,
4935 PyObject *list,
4936 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004937 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004938{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004939 register Py_ssize_t i;
4940 register Py_ssize_t j;
4941 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004942 PyObject *str;
4943
4944 for (i = j = len - 1; i >= 0; ) {
4945 if (self->str[i] == ch) {
4946 if (maxcount-- <= 0)
4947 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004948 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004949 j = i = i - 1;
4950 } else
4951 i--;
4952 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004953 if (j >= -1) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004954 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004955 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004956 if (PyList_Reverse(list) < 0)
4957 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004958 return list;
4959
4960 onError:
4961 Py_DECREF(list);
4962 return NULL;
4963}
4964
4965static
4966PyObject *rsplit_substring(PyUnicodeObject *self,
4967 PyObject *list,
4968 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004969 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004970{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004971 register Py_ssize_t i;
4972 register Py_ssize_t j;
4973 Py_ssize_t len = self->length;
4974 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004975 PyObject *str;
4976
4977 for (i = len - sublen, j = len; i >= 0; ) {
4978 if (Py_UNICODE_MATCH(self, i, substring)) {
4979 if (maxcount-- <= 0)
4980 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004981 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004982 j = i;
4983 i -= sublen;
4984 } else
4985 i--;
4986 }
4987 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004988 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004989 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004990 if (PyList_Reverse(list) < 0)
4991 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004992 return list;
4993
4994 onError:
4995 Py_DECREF(list);
4996 return NULL;
4997}
4998
Guido van Rossumd57fd912000-03-10 22:53:23 +00004999#undef SPLIT_APPEND
5000
5001static
5002PyObject *split(PyUnicodeObject *self,
5003 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005004 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005005{
5006 PyObject *list;
5007
5008 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005009 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005010
5011 list = PyList_New(0);
5012 if (!list)
5013 return NULL;
5014
5015 if (substring == NULL)
5016 return split_whitespace(self,list,maxcount);
5017
5018 else if (substring->length == 1)
5019 return split_char(self,list,substring->str[0],maxcount);
5020
5021 else if (substring->length == 0) {
5022 Py_DECREF(list);
5023 PyErr_SetString(PyExc_ValueError, "empty separator");
5024 return NULL;
5025 }
5026 else
5027 return split_substring(self,list,substring,maxcount);
5028}
5029
Tim Petersced69f82003-09-16 20:30:58 +00005030static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005031PyObject *rsplit(PyUnicodeObject *self,
5032 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005033 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005034{
5035 PyObject *list;
5036
5037 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005038 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005039
5040 list = PyList_New(0);
5041 if (!list)
5042 return NULL;
5043
5044 if (substring == NULL)
5045 return rsplit_whitespace(self,list,maxcount);
5046
5047 else if (substring->length == 1)
5048 return rsplit_char(self,list,substring->str[0],maxcount);
5049
5050 else if (substring->length == 0) {
5051 Py_DECREF(list);
5052 PyErr_SetString(PyExc_ValueError, "empty separator");
5053 return NULL;
5054 }
5055 else
5056 return rsplit_substring(self,list,substring,maxcount);
5057}
5058
5059static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005060PyObject *replace(PyUnicodeObject *self,
5061 PyUnicodeObject *str1,
5062 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005063 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005064{
5065 PyUnicodeObject *u;
5066
5067 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005068 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005069
Fredrik Lundh347ee272006-05-24 16:35:18 +00005070 if (str1->length == str2->length) {
5071 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005072 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005073 if (str1->length == 1) {
5074 /* replace characters */
5075 Py_UNICODE u1, u2;
5076 if (!findchar(self->str, self->length, str1->str[0]))
5077 goto nothing;
5078 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5079 if (!u)
5080 return NULL;
5081 Py_UNICODE_COPY(u->str, self->str, self->length);
5082 u1 = str1->str[0];
5083 u2 = str2->str[0];
5084 for (i = 0; i < u->length; i++)
5085 if (u->str[i] == u1) {
5086 if (--maxcount < 0)
5087 break;
5088 u->str[i] = u2;
5089 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005090 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005091 i = fastsearch(
5092 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005093 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005094 if (i < 0)
5095 goto nothing;
5096 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5097 if (!u)
5098 return NULL;
5099 Py_UNICODE_COPY(u->str, self->str, self->length);
5100 while (i <= self->length - str1->length)
5101 if (Py_UNICODE_MATCH(self, i, str1)) {
5102 if (--maxcount < 0)
5103 break;
5104 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5105 i += str1->length;
5106 } else
5107 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005108 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005109 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005110
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005111 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005112 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005113 Py_UNICODE *p;
5114
5115 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005116 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005117 if (n > maxcount)
5118 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005119 if (n == 0)
5120 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005121 /* new_size = self->length + n * (str2->length - str1->length)); */
5122 delta = (str2->length - str1->length);
5123 if (delta == 0) {
5124 new_size = self->length;
5125 } else {
5126 product = n * (str2->length - str1->length);
5127 if ((product / (str2->length - str1->length)) != n) {
5128 PyErr_SetString(PyExc_OverflowError,
5129 "replace string is too long");
5130 return NULL;
5131 }
5132 new_size = self->length + product;
5133 if (new_size < 0) {
5134 PyErr_SetString(PyExc_OverflowError,
5135 "replace string is too long");
5136 return NULL;
5137 }
5138 }
5139 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005140 if (!u)
5141 return NULL;
5142 i = 0;
5143 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005144 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005145 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005146 while (n-- > 0) {
5147 /* look for next match */
5148 j = i;
5149 while (j <= e) {
5150 if (Py_UNICODE_MATCH(self, j, str1))
5151 break;
5152 j++;
5153 }
5154 if (j > i) {
5155 if (j > e)
5156 break;
5157 /* copy unchanged part [i:j] */
5158 Py_UNICODE_COPY(p, self->str+i, j-i);
5159 p += j - i;
5160 }
5161 /* copy substitution string */
5162 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005163 Py_UNICODE_COPY(p, str2->str, str2->length);
5164 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005165 }
5166 i = j + str1->length;
5167 }
5168 if (i < self->length)
5169 /* copy tail [i:] */
5170 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005171 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005172 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005173 while (n > 0) {
5174 Py_UNICODE_COPY(p, str2->str, str2->length);
5175 p += str2->length;
5176 if (--n <= 0)
5177 break;
5178 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005180 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005181 }
5182 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005184
5185nothing:
5186 /* nothing to replace; return original string (when possible) */
5187 if (PyUnicode_CheckExact(self)) {
5188 Py_INCREF(self);
5189 return (PyObject *) self;
5190 }
5191 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005192}
5193
5194/* --- Unicode Object Methods --------------------------------------------- */
5195
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005196PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197"S.title() -> unicode\n\
5198\n\
5199Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005200characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005201
5202static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005203unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205 return fixup(self, fixtitle);
5206}
5207
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005208PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209"S.capitalize() -> unicode\n\
5210\n\
5211Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005212have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213
5214static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005215unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217 return fixup(self, fixcapitalize);
5218}
5219
5220#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005221PyDoc_STRVAR(capwords__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222"S.capwords() -> unicode\n\
5223\n\
5224Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005225normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226
5227static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005228unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229{
5230 PyObject *list;
5231 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005232 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005233
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234 /* Split into words */
5235 list = split(self, NULL, -1);
5236 if (!list)
5237 return NULL;
5238
5239 /* Capitalize each word */
5240 for (i = 0; i < PyList_GET_SIZE(list); i++) {
5241 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
5242 fixcapitalize);
5243 if (item == NULL)
5244 goto onError;
5245 Py_DECREF(PyList_GET_ITEM(list, i));
5246 PyList_SET_ITEM(list, i, item);
5247 }
5248
5249 /* Join the words to form a new string */
5250 item = PyUnicode_Join(NULL, list);
5251
5252onError:
5253 Py_DECREF(list);
5254 return (PyObject *)item;
5255}
5256#endif
5257
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005258/* Argument converter. Coerces to a single unicode character */
5259
5260static int
5261convert_uc(PyObject *obj, void *addr)
5262{
5263 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5264 PyObject *uniobj;
5265 Py_UNICODE *unistr;
5266
5267 uniobj = PyUnicode_FromObject(obj);
5268 if (uniobj == NULL) {
5269 PyErr_SetString(PyExc_TypeError,
5270 "The fill character cannot be converted to Unicode");
5271 return 0;
5272 }
5273 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5274 PyErr_SetString(PyExc_TypeError,
5275 "The fill character must be exactly one character long");
5276 Py_DECREF(uniobj);
5277 return 0;
5278 }
5279 unistr = PyUnicode_AS_UNICODE(uniobj);
5280 *fillcharloc = unistr[0];
5281 Py_DECREF(uniobj);
5282 return 1;
5283}
5284
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005285PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005286"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005288Return S centered in a Unicode string of length width. Padding is\n\
5289done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290
5291static PyObject *
5292unicode_center(PyUnicodeObject *self, PyObject *args)
5293{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005294 Py_ssize_t marg, left;
5295 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005296 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005297
Thomas Woutersde017742006-02-16 19:34:37 +00005298 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299 return NULL;
5300
Tim Peters7a29bd52001-09-12 03:03:31 +00005301 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005302 Py_INCREF(self);
5303 return (PyObject*) self;
5304 }
5305
5306 marg = width - self->length;
5307 left = marg / 2 + (marg & width & 1);
5308
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005309 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310}
5311
Marc-André Lemburge5034372000-08-08 08:04:29 +00005312#if 0
5313
5314/* This code should go into some future Unicode collation support
5315 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005316 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005317
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005318/* speedy UTF-16 code point order comparison */
5319/* gleaned from: */
5320/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5321
/* Correction table indexed by the top five bits of a 16-bit code unit
   (c >> 11).  Only entries 27..31, i.e. code units 0xD800 and above, are
   non-zero.
   NOTE(review): the table has 32 entries, so indexing with c >> 11 assumes
   c <= 0xFFFF -- confirm before enabling on builds with a wider
   Py_UNICODE. */
static short utf16Fixup[32] =
{
    /*  0 ..  7 */  0, 0, 0, 0, 0, 0, 0, 0,
    /*  8 .. 15 */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 16 .. 23 */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 24 .. 26 */  0, 0, 0,
    /* 27       */  0x2000,
    /* 28 .. 31 */  -0x800, -0x800, -0x800, -0x800
};
5329
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330static int
5331unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5332{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005333 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005334
Guido van Rossumd57fd912000-03-10 22:53:23 +00005335 Py_UNICODE *s1 = str1->str;
5336 Py_UNICODE *s2 = str2->str;
5337
5338 len1 = str1->length;
5339 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005340
Guido van Rossumd57fd912000-03-10 22:53:23 +00005341 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005342 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005343
5344 c1 = *s1++;
5345 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005346
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005347 if (c1 > (1<<11) * 26)
5348 c1 += utf16Fixup[c1>>11];
5349 if (c2 > (1<<11) * 26)
5350 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005351 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005352
5353 if (c1 != c2)
5354 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005355
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005356 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005357 }
5358
5359 return (len1 < len2) ? -1 : (len1 != len2);
5360}
5361
Marc-André Lemburge5034372000-08-08 08:04:29 +00005362#else
5363
5364static int
5365unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5366{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005367 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005368
5369 Py_UNICODE *s1 = str1->str;
5370 Py_UNICODE *s2 = str2->str;
5371
5372 len1 = str1->length;
5373 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005374
Marc-André Lemburge5034372000-08-08 08:04:29 +00005375 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005376 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005377
Fredrik Lundh45714e92001-06-26 16:39:36 +00005378 c1 = *s1++;
5379 c2 = *s2++;
5380
5381 if (c1 != c2)
5382 return (c1 < c2) ? -1 : 1;
5383
Marc-André Lemburge5034372000-08-08 08:04:29 +00005384 len1--; len2--;
5385 }
5386
5387 return (len1 < len2) ? -1 : (len1 != len2);
5388}
5389
5390#endif
5391
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392int PyUnicode_Compare(PyObject *left,
5393 PyObject *right)
5394{
5395 PyUnicodeObject *u = NULL, *v = NULL;
5396 int result;
5397
5398 /* Coerce the two arguments */
5399 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5400 if (u == NULL)
5401 goto onError;
5402 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5403 if (v == NULL)
5404 goto onError;
5405
Thomas Wouters7e474022000-07-16 12:04:32 +00005406 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407 if (v == u) {
5408 Py_DECREF(u);
5409 Py_DECREF(v);
5410 return 0;
5411 }
5412
5413 result = unicode_compare(u, v);
5414
5415 Py_DECREF(u);
5416 Py_DECREF(v);
5417 return result;
5418
5419onError:
5420 Py_XDECREF(u);
5421 Py_XDECREF(v);
5422 return -1;
5423}
5424
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00005425PyObject *PyUnicode_RichCompare(PyObject *left,
5426 PyObject *right,
5427 int op)
5428{
5429 int result;
5430
5431 result = PyUnicode_Compare(left, right);
5432 if (result == -1 && PyErr_Occurred())
5433 goto onError;
5434
5435 /* Convert the return value to a Boolean */
5436 switch (op) {
5437 case Py_EQ:
5438 result = (result == 0);
5439 break;
5440 case Py_NE:
5441 result = (result != 0);
5442 break;
5443 case Py_LE:
5444 result = (result <= 0);
5445 break;
5446 case Py_GE:
5447 result = (result >= 0);
5448 break;
5449 case Py_LT:
5450 result = (result == -1);
5451 break;
5452 case Py_GT:
5453 result = (result == 1);
5454 break;
5455 }
5456 return PyBool_FromLong(result);
5457
5458 onError:
5459
5460 /* Standard case
5461
5462 Type errors mean that PyUnicode_FromObject() could not convert
5463 one of the arguments (usually the right hand side) to Unicode,
5464 ie. we can't handle the comparison request. However, it is
5465 possible that the other object knows a comparison method, which
5466 is why we return Py_NotImplemented to give the other object a
5467 chance.
5468
5469 */
5470 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5471 PyErr_Clear();
5472 Py_INCREF(Py_NotImplemented);
5473 return Py_NotImplemented;
5474 }
5475 if (op != Py_EQ && op != Py_NE)
5476 return NULL;
5477
5478 /* Equality comparison.
5479
5480 This is a special case: we silence any PyExc_UnicodeDecodeError
5481 and instead turn it into a PyErr_UnicodeWarning.
5482
5483 */
5484 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5485 return NULL;
5486 PyErr_Clear();
5487 if (PyErr_Warn(PyExc_UnicodeWarning,
5488 (op == Py_EQ) ?
5489 "Unicode equal comparison "
5490 "failed to convert both arguments to Unicode - "
5491 "interpreting them as being unequal" :
5492 "Unicode unequal comparison "
5493 "failed to convert both arguments to Unicode - "
5494 "interpreting them as being unequal"
5495 ) < 0)
5496 return NULL;
5497 result = (op == Py_NE);
5498 return PyBool_FromLong(result);
5499}
5500
Guido van Rossum403d68b2000-03-13 15:55:09 +00005501int PyUnicode_Contains(PyObject *container,
5502 PyObject *element)
5503{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005504 PyObject *str, *sub;
5505 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005506
5507 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005508 sub = PyUnicode_FromObject(element);
5509 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005510 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005511 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00005512 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005513 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00005514
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005515 str = PyUnicode_FromObject(container);
5516 if (!str) {
5517 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00005518 return -1;
5519 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005520
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005521 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00005522
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005523 Py_DECREF(str);
5524 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00005525
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005526 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005527}
5528
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529/* Concat to string or Unicode object giving a new Unicode object. */
5530
5531PyObject *PyUnicode_Concat(PyObject *left,
5532 PyObject *right)
5533{
5534 PyUnicodeObject *u = NULL, *v = NULL, *w;
5535
5536 /* Coerce the two arguments */
5537 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5538 if (u == NULL)
5539 goto onError;
5540 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5541 if (v == NULL)
5542 goto onError;
5543
5544 /* Shortcuts */
5545 if (v == unicode_empty) {
5546 Py_DECREF(v);
5547 return (PyObject *)u;
5548 }
5549 if (u == unicode_empty) {
5550 Py_DECREF(u);
5551 return (PyObject *)v;
5552 }
5553
5554 /* Concat the two Unicode strings */
5555 w = _PyUnicode_New(u->length + v->length);
5556 if (w == NULL)
5557 goto onError;
5558 Py_UNICODE_COPY(w->str, u->str, u->length);
5559 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5560
5561 Py_DECREF(u);
5562 Py_DECREF(v);
5563 return (PyObject *)w;
5564
5565onError:
5566 Py_XDECREF(u);
5567 Py_XDECREF(v);
5568 return NULL;
5569}
5570
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005571PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572"S.count(sub[, start[, end]]) -> int\n\
5573\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00005574Return the number of non-overlapping occurrences of substring sub in\n\
5575Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005576interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577
5578static PyObject *
5579unicode_count(PyUnicodeObject *self, PyObject *args)
5580{
5581 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005582 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005583 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584 PyObject *result;
5585
Guido van Rossumb8872e62000-05-09 14:14:27 +00005586 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5587 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588 return NULL;
5589
5590 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005591 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592 if (substring == NULL)
5593 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005594
Fredrik Lundhc8162812006-05-26 19:33:03 +00005595 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005596
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005597 result = PyInt_FromSsize_t(
5598 stringlib_count(self->str + start, end - start,
5599 substring->str, substring->length)
5600 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601
5602 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005603
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604 return result;
5605}
5606
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005607PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005608"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005610Encodes S using the codec registered for encoding. encoding defaults\n\
5611to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005612handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005613a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5614'xmlcharrefreplace' as well as any other name registered with\n\
5615codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616
5617static PyObject *
5618unicode_encode(PyUnicodeObject *self, PyObject *args)
5619{
5620 char *encoding = NULL;
5621 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005622 PyObject *v;
5623
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5625 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005626 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005627 if (v == NULL)
5628 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005629 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5630 PyErr_Format(PyExc_TypeError,
5631 "encoder did not return a string/unicode object "
5632 "(type=%.400s)",
5633 v->ob_type->tp_name);
5634 Py_DECREF(v);
5635 return NULL;
5636 }
5637 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005638
5639 onError:
5640 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005641}
5642
5643PyDoc_STRVAR(decode__doc__,
5644"S.decode([encoding[,errors]]) -> string or unicode\n\
5645\n\
5646Decodes S using the codec registered for encoding. encoding defaults\n\
5647to the default encoding. errors may be given to set a different error\n\
5648handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5649a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5650as well as any other name registerd with codecs.register_error that is\n\
5651able to handle UnicodeDecodeErrors.");
5652
5653static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005654unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005655{
5656 char *encoding = NULL;
5657 char *errors = NULL;
5658 PyObject *v;
5659
5660 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5661 return NULL;
5662 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005663 if (v == NULL)
5664 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005665 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5666 PyErr_Format(PyExc_TypeError,
5667 "decoder did not return a string/unicode object "
5668 "(type=%.400s)",
5669 v->ob_type->tp_name);
5670 Py_DECREF(v);
5671 return NULL;
5672 }
5673 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005674
5675 onError:
5676 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677}
5678
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005679PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680"S.expandtabs([tabsize]) -> unicode\n\
5681\n\
5682Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005683If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684
5685static PyObject*
5686unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5687{
5688 Py_UNICODE *e;
5689 Py_UNICODE *p;
5690 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005691 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692 PyUnicodeObject *u;
5693 int tabsize = 8;
5694
5695 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5696 return NULL;
5697
Thomas Wouters7e474022000-07-16 12:04:32 +00005698 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 i = j = 0;
5700 e = self->str + self->length;
5701 for (p = self->str; p < e; p++)
5702 if (*p == '\t') {
5703 if (tabsize > 0)
5704 j += tabsize - (j % tabsize);
5705 }
5706 else {
5707 j++;
5708 if (*p == '\n' || *p == '\r') {
5709 i += j;
5710 j = 0;
5711 }
5712 }
5713
5714 /* Second pass: create output string and fill it */
5715 u = _PyUnicode_New(i + j);
5716 if (!u)
5717 return NULL;
5718
5719 j = 0;
5720 q = u->str;
5721
5722 for (p = self->str; p < e; p++)
5723 if (*p == '\t') {
5724 if (tabsize > 0) {
5725 i = tabsize - (j % tabsize);
5726 j += i;
5727 while (i--)
5728 *q++ = ' ';
5729 }
5730 }
5731 else {
5732 j++;
5733 *q++ = *p;
5734 if (*p == '\n' || *p == '\r')
5735 j = 0;
5736 }
5737
5738 return (PyObject*) u;
5739}
5740
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005741PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742"S.find(sub [,start [,end]]) -> int\n\
5743\n\
5744Return the lowest index in S where substring sub is found,\n\
5745such that sub is contained within s[start,end]. Optional\n\
5746arguments start and end are interpreted as in slice notation.\n\
5747\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005748Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749
5750static PyObject *
5751unicode_find(PyUnicodeObject *self, PyObject *args)
5752{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005753 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005754 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005755 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005756 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757
Guido van Rossumb8872e62000-05-09 14:14:27 +00005758 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5759 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005761 substring = PyUnicode_FromObject(substring);
5762 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763 return NULL;
5764
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005765 result = stringlib_find_slice(
5766 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5767 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5768 start, end
5769 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770
5771 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005772
5773 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005774}
5775
5776static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005777unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778{
5779 if (index < 0 || index >= self->length) {
5780 PyErr_SetString(PyExc_IndexError, "string index out of range");
5781 return NULL;
5782 }
5783
5784 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5785}
5786
5787static long
5788unicode_hash(PyUnicodeObject *self)
5789{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005790 /* Since Unicode objects compare equal to their ASCII string
5791 counterparts, they should use the individual character values
5792 as basis for their hash value. This is needed to assure that
5793 strings and Unicode objects behave in the same way as
5794 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795
Martin v. Löwis18e16552006-02-15 17:27:45 +00005796 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005797 register Py_UNICODE *p;
5798 register long x;
5799
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800 if (self->hash != -1)
5801 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005802 len = PyUnicode_GET_SIZE(self);
5803 p = PyUnicode_AS_UNICODE(self);
5804 x = *p << 7;
5805 while (--len >= 0)
5806 x = (1000003*x) ^ *p++;
5807 x ^= PyUnicode_GET_SIZE(self);
5808 if (x == -1)
5809 x = -2;
5810 self->hash = x;
5811 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005812}
5813
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005814PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815"S.index(sub [,start [,end]]) -> int\n\
5816\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005817Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818
5819static PyObject *
5820unicode_index(PyUnicodeObject *self, PyObject *args)
5821{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005822 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005823 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005824 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005825 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005826
Guido van Rossumb8872e62000-05-09 14:14:27 +00005827 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5828 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005830 substring = PyUnicode_FromObject(substring);
5831 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832 return NULL;
5833
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005834 result = stringlib_find_slice(
5835 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5836 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5837 start, end
5838 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839
5840 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005841
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842 if (result < 0) {
5843 PyErr_SetString(PyExc_ValueError, "substring not found");
5844 return NULL;
5845 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005846
Martin v. Löwis18e16552006-02-15 17:27:45 +00005847 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848}
5849
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005850PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005851"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005852\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005853Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005854at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855
5856static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005857unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858{
5859 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5860 register const Py_UNICODE *e;
5861 int cased;
5862
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863 /* Shortcut for single character strings */
5864 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005865 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005867 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005868 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005869 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005870
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871 e = p + PyUnicode_GET_SIZE(self);
5872 cased = 0;
5873 for (; p < e; p++) {
5874 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005875
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005877 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878 else if (!cased && Py_UNICODE_ISLOWER(ch))
5879 cased = 1;
5880 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005881 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882}
5883
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005884PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005885"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005887Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005888at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005889
5890static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005891unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892{
5893 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5894 register const Py_UNICODE *e;
5895 int cased;
5896
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897 /* Shortcut for single character strings */
5898 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005899 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005901 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005902 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005903 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005904
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905 e = p + PyUnicode_GET_SIZE(self);
5906 cased = 0;
5907 for (; p < e; p++) {
5908 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005909
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005911 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912 else if (!cased && Py_UNICODE_ISUPPER(ch))
5913 cased = 1;
5914 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005915 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916}
5917
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005918PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005919"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005921Return True if S is a titlecased string and there is at least one\n\
5922character in S, i.e. upper- and titlecase characters may only\n\
5923follow uncased characters and lowercase characters only cased ones.\n\
5924Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925
5926static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005927unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928{
5929 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5930 register const Py_UNICODE *e;
5931 int cased, previous_is_cased;
5932
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933 /* Shortcut for single character strings */
5934 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005935 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5936 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005938 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005939 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005940 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005941
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942 e = p + PyUnicode_GET_SIZE(self);
5943 cased = 0;
5944 previous_is_cased = 0;
5945 for (; p < e; p++) {
5946 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005947
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5949 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005950 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951 previous_is_cased = 1;
5952 cased = 1;
5953 }
5954 else if (Py_UNICODE_ISLOWER(ch)) {
5955 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005956 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957 previous_is_cased = 1;
5958 cased = 1;
5959 }
5960 else
5961 previous_is_cased = 0;
5962 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005963 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964}
5965
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005966PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005967"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005969Return True if all characters in S are whitespace\n\
5970and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971
5972static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005973unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974{
5975 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5976 register const Py_UNICODE *e;
5977
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978 /* Shortcut for single character strings */
5979 if (PyUnicode_GET_SIZE(self) == 1 &&
5980 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005981 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005983 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005984 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005985 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005986
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 e = p + PyUnicode_GET_SIZE(self);
5988 for (; p < e; p++) {
5989 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005990 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005992 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993}
5994
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005995PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005996"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005997\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005998Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005999and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006000
6001static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006002unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006003{
6004 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6005 register const Py_UNICODE *e;
6006
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006007 /* Shortcut for single character strings */
6008 if (PyUnicode_GET_SIZE(self) == 1 &&
6009 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006010 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006011
6012 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006013 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006014 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006015
6016 e = p + PyUnicode_GET_SIZE(self);
6017 for (; p < e; p++) {
6018 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006019 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006020 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006021 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006022}
6023
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006024PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006025"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006026\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006027Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006028and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006029
6030static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006031unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006032{
6033 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6034 register const Py_UNICODE *e;
6035
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006036 /* Shortcut for single character strings */
6037 if (PyUnicode_GET_SIZE(self) == 1 &&
6038 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006039 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006040
6041 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006042 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006043 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006044
6045 e = p + PyUnicode_GET_SIZE(self);
6046 for (; p < e; p++) {
6047 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006048 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006049 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006050 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006051}
6052
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006053PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006054"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006056Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006057False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058
6059static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006060unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061{
6062 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6063 register const Py_UNICODE *e;
6064
Guido van Rossumd57fd912000-03-10 22:53:23 +00006065 /* Shortcut for single character strings */
6066 if (PyUnicode_GET_SIZE(self) == 1 &&
6067 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006068 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006070 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006071 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006072 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006073
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074 e = p + PyUnicode_GET_SIZE(self);
6075 for (; p < e; p++) {
6076 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006077 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006079 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080}
6081
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006082PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006083"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006085Return True if all characters in S are digits\n\
6086and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087
6088static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006089unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090{
6091 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6092 register const Py_UNICODE *e;
6093
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094 /* Shortcut for single character strings */
6095 if (PyUnicode_GET_SIZE(self) == 1 &&
6096 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006097 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006099 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006100 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006101 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006102
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103 e = p + PyUnicode_GET_SIZE(self);
6104 for (; p < e; p++) {
6105 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006106 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006108 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109}
6110
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006111PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006112"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006114Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006115False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116
6117static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006118unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119{
6120 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6121 register const Py_UNICODE *e;
6122
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123 /* Shortcut for single character strings */
6124 if (PyUnicode_GET_SIZE(self) == 1 &&
6125 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006126 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006128 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006129 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006130 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006131
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132 e = p + PyUnicode_GET_SIZE(self);
6133 for (; p < e; p++) {
6134 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006135 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006137 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138}
6139
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006140PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141"S.join(sequence) -> unicode\n\
6142\n\
6143Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006144sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145
6146static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006147unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006149 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150}
6151
Martin v. Löwis18e16552006-02-15 17:27:45 +00006152static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153unicode_length(PyUnicodeObject *self)
6154{
6155 return self->length;
6156}
6157
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006158PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006159"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160\n\
6161Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006162done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163
6164static PyObject *
6165unicode_ljust(PyUnicodeObject *self, PyObject *args)
6166{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006167 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006168 Py_UNICODE fillchar = ' ';
6169
Martin v. Löwis412fb672006-04-13 06:34:32 +00006170 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171 return NULL;
6172
Tim Peters7a29bd52001-09-12 03:03:31 +00006173 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174 Py_INCREF(self);
6175 return (PyObject*) self;
6176 }
6177
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006178 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179}
6180
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006181PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182"S.lower() -> unicode\n\
6183\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006184Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185
6186static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006187unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006189 return fixup(self, fixlower);
6190}
6191
/* Strip modes.  Each value doubles as an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string with its
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (&stripformat[i][3])
6200
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006201/* externally visible for str.strip(unicode) */
6202PyObject *
6203_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6204{
6205 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006206 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006207 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006208 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6209 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006210
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006211 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6212
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006213 i = 0;
6214 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006215 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6216 i++;
6217 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006218 }
6219
6220 j = len;
6221 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006222 do {
6223 j--;
6224 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6225 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006226 }
6227
6228 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006229 Py_INCREF(self);
6230 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006231 }
6232 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006233 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006234}
6235
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236
6237static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006238do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006240 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006241 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006242
6243 i = 0;
6244 if (striptype != RIGHTSTRIP) {
6245 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6246 i++;
6247 }
6248 }
6249
6250 j = len;
6251 if (striptype != LEFTSTRIP) {
6252 do {
6253 j--;
6254 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6255 j++;
6256 }
6257
6258 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6259 Py_INCREF(self);
6260 return (PyObject*)self;
6261 }
6262 else
6263 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264}
6265
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006266
6267static PyObject *
6268do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6269{
6270 PyObject *sep = NULL;
6271
6272 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6273 return NULL;
6274
6275 if (sep != NULL && sep != Py_None) {
6276 if (PyUnicode_Check(sep))
6277 return _PyUnicode_XStrip(self, striptype, sep);
6278 else if (PyString_Check(sep)) {
6279 PyObject *res;
6280 sep = PyUnicode_FromObject(sep);
6281 if (sep==NULL)
6282 return NULL;
6283 res = _PyUnicode_XStrip(self, striptype, sep);
6284 Py_DECREF(sep);
6285 return res;
6286 }
6287 else {
6288 PyErr_Format(PyExc_TypeError,
6289 "%s arg must be None, unicode or str",
6290 STRIPNAME(striptype));
6291 return NULL;
6292 }
6293 }
6294
6295 return do_strip(self, striptype);
6296}
6297
6298
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006299PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006300"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006301\n\
6302Return a copy of the string S with leading and trailing\n\
6303whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006304If chars is given and not None, remove characters in chars instead.\n\
6305If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006306
6307static PyObject *
6308unicode_strip(PyUnicodeObject *self, PyObject *args)
6309{
6310 if (PyTuple_GET_SIZE(args) == 0)
6311 return do_strip(self, BOTHSTRIP); /* Common case */
6312 else
6313 return do_argstrip(self, BOTHSTRIP, args);
6314}
6315
6316
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006317PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006318"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006319\n\
6320Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006321If chars is given and not None, remove characters in chars instead.\n\
6322If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006323
6324static PyObject *
6325unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6326{
6327 if (PyTuple_GET_SIZE(args) == 0)
6328 return do_strip(self, LEFTSTRIP); /* Common case */
6329 else
6330 return do_argstrip(self, LEFTSTRIP, args);
6331}
6332
6333
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006334PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006335"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006336\n\
6337Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006338If chars is given and not None, remove characters in chars instead.\n\
6339If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006340
6341static PyObject *
6342unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6343{
6344 if (PyTuple_GET_SIZE(args) == 0)
6345 return do_strip(self, RIGHTSTRIP); /* Common case */
6346 else
6347 return do_argstrip(self, RIGHTSTRIP, args);
6348}
6349
6350
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006352unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353{
6354 PyUnicodeObject *u;
6355 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006356 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006357 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358
6359 if (len < 0)
6360 len = 0;
6361
Tim Peters7a29bd52001-09-12 03:03:31 +00006362 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363 /* no repeat, return original string */
6364 Py_INCREF(str);
6365 return (PyObject*) str;
6366 }
Tim Peters8f422462000-09-09 06:13:41 +00006367
6368 /* ensure # of chars needed doesn't overflow int and # of bytes
6369 * needed doesn't overflow size_t
6370 */
6371 nchars = len * str->length;
6372 if (len && nchars / len != str->length) {
6373 PyErr_SetString(PyExc_OverflowError,
6374 "repeated string is too long");
6375 return NULL;
6376 }
6377 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6378 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6379 PyErr_SetString(PyExc_OverflowError,
6380 "repeated string is too long");
6381 return NULL;
6382 }
6383 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384 if (!u)
6385 return NULL;
6386
6387 p = u->str;
6388
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006389 if (str->length == 1 && len > 0) {
6390 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006391 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00006392 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006393 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006394 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006395 done = str->length;
6396 }
6397 while (done < nchars) {
6398 int n = (done <= nchars-done) ? done : nchars-done;
6399 Py_UNICODE_COPY(p+done, p, n);
6400 done += n;
6401 }
6402 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403
6404 return (PyObject*) u;
6405}
6406
6407PyObject *PyUnicode_Replace(PyObject *obj,
6408 PyObject *subobj,
6409 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006410 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411{
6412 PyObject *self;
6413 PyObject *str1;
6414 PyObject *str2;
6415 PyObject *result;
6416
6417 self = PyUnicode_FromObject(obj);
6418 if (self == NULL)
6419 return NULL;
6420 str1 = PyUnicode_FromObject(subobj);
6421 if (str1 == NULL) {
6422 Py_DECREF(self);
6423 return NULL;
6424 }
6425 str2 = PyUnicode_FromObject(replobj);
6426 if (str2 == NULL) {
6427 Py_DECREF(self);
6428 Py_DECREF(str1);
6429 return NULL;
6430 }
Tim Petersced69f82003-09-16 20:30:58 +00006431 result = replace((PyUnicodeObject *)self,
6432 (PyUnicodeObject *)str1,
6433 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434 maxcount);
6435 Py_DECREF(self);
6436 Py_DECREF(str1);
6437 Py_DECREF(str2);
6438 return result;
6439}
6440
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006441PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442"S.replace (old, new[, maxsplit]) -> unicode\n\
6443\n\
6444Return a copy of S with all occurrences of substring\n\
6445old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006446given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447
6448static PyObject*
6449unicode_replace(PyUnicodeObject *self, PyObject *args)
6450{
6451 PyUnicodeObject *str1;
6452 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006453 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454 PyObject *result;
6455
Martin v. Löwis18e16552006-02-15 17:27:45 +00006456 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457 return NULL;
6458 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6459 if (str1 == NULL)
6460 return NULL;
6461 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006462 if (str2 == NULL) {
6463 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006465 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466
6467 result = replace(self, str1, str2, maxcount);
6468
6469 Py_DECREF(str1);
6470 Py_DECREF(str2);
6471 return result;
6472}
6473
6474static
6475PyObject *unicode_repr(PyObject *unicode)
6476{
6477 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6478 PyUnicode_GET_SIZE(unicode),
6479 1);
6480}
6481
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006482PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483"S.rfind(sub [,start [,end]]) -> int\n\
6484\n\
6485Return the highest index in S where substring sub is found,\n\
6486such that sub is contained within s[start,end]. Optional\n\
6487arguments start and end are interpreted as in slice notation.\n\
6488\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006489Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490
6491static PyObject *
6492unicode_rfind(PyUnicodeObject *self, PyObject *args)
6493{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006494 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006495 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006496 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006497 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498
Guido van Rossumb8872e62000-05-09 14:14:27 +00006499 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6500 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006502 substring = PyUnicode_FromObject(substring);
6503 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504 return NULL;
6505
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006506 result = stringlib_rfind_slice(
6507 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6508 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6509 start, end
6510 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511
6512 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006513
6514 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515}
6516
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006517PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518"S.rindex(sub [,start [,end]]) -> int\n\
6519\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006520Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521
6522static PyObject *
6523unicode_rindex(PyUnicodeObject *self, PyObject *args)
6524{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006525 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006526 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006527 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006528 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529
Guido van Rossumb8872e62000-05-09 14:14:27 +00006530 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6531 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006533 substring = PyUnicode_FromObject(substring);
6534 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535 return NULL;
6536
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006537 result = stringlib_rfind_slice(
6538 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6539 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6540 start, end
6541 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542
6543 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006544
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545 if (result < 0) {
6546 PyErr_SetString(PyExc_ValueError, "substring not found");
6547 return NULL;
6548 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006549 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550}
6551
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006552PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006553"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554\n\
6555Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006556done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557
6558static PyObject *
6559unicode_rjust(PyUnicodeObject *self, PyObject *args)
6560{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006561 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006562 Py_UNICODE fillchar = ' ';
6563
Martin v. Löwis412fb672006-04-13 06:34:32 +00006564 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565 return NULL;
6566
Tim Peters7a29bd52001-09-12 03:03:31 +00006567 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568 Py_INCREF(self);
6569 return (PyObject*) self;
6570 }
6571
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006572 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573}
6574
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006576unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577{
6578 /* standard clamping */
6579 if (start < 0)
6580 start = 0;
6581 if (end < 0)
6582 end = 0;
6583 if (end > self->length)
6584 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006585 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586 /* full slice, return original string */
6587 Py_INCREF(self);
6588 return (PyObject*) self;
6589 }
6590 if (start > end)
6591 start = end;
6592 /* copy slice */
6593 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6594 end - start);
6595}
6596
6597PyObject *PyUnicode_Split(PyObject *s,
6598 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006599 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600{
6601 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006602
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603 s = PyUnicode_FromObject(s);
6604 if (s == NULL)
6605 return NULL;
6606 if (sep != NULL) {
6607 sep = PyUnicode_FromObject(sep);
6608 if (sep == NULL) {
6609 Py_DECREF(s);
6610 return NULL;
6611 }
6612 }
6613
6614 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6615
6616 Py_DECREF(s);
6617 Py_XDECREF(sep);
6618 return result;
6619}
6620
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006621PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622"S.split([sep [,maxsplit]]) -> list of strings\n\
6623\n\
6624Return a list of the words in S, using sep as the\n\
6625delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006626splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006627any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628
6629static PyObject*
6630unicode_split(PyUnicodeObject *self, PyObject *args)
6631{
6632 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006633 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634
Martin v. Löwis18e16552006-02-15 17:27:45 +00006635 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636 return NULL;
6637
6638 if (substring == Py_None)
6639 return split(self, NULL, maxcount);
6640 else if (PyUnicode_Check(substring))
6641 return split(self, (PyUnicodeObject *)substring, maxcount);
6642 else
6643 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6644}
6645
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006646PyObject *
6647PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6648{
6649 PyObject* str_obj;
6650 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006651 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00006652
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006653 str_obj = PyUnicode_FromObject(str_in);
6654 if (!str_obj)
6655 return NULL;
6656 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00006657 if (!sep_obj) {
6658 Py_DECREF(str_obj);
6659 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006660 }
6661
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006662 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00006663 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6664 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6665 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006666
Fredrik Lundhb9479482006-05-26 17:22:38 +00006667 Py_DECREF(sep_obj);
6668 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006669
6670 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006671}
6672
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006673
6674PyObject *
6675PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6676{
6677 PyObject* str_obj;
6678 PyObject* sep_obj;
6679 PyObject* out;
6680
6681 str_obj = PyUnicode_FromObject(str_in);
6682 if (!str_obj)
6683 return NULL;
6684 sep_obj = PyUnicode_FromObject(sep_in);
6685 if (!sep_obj) {
6686 Py_DECREF(str_obj);
6687 return NULL;
6688 }
6689
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006690 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006691 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6692 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6693 );
6694
6695 Py_DECREF(sep_obj);
6696 Py_DECREF(str_obj);
6697
6698 return out;
6699}
6700
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006701PyDoc_STRVAR(partition__doc__,
6702"S.partition(sep) -> (head, sep, tail)\n\
6703\n\
6704Searches for the separator sep in S, and returns the part before it,\n\
6705the separator itself, and the part after it. If the separator is not\n\
6706found, returns S and two empty strings.");
6707
6708static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00006709unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006710{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006711 return PyUnicode_Partition((PyObject *)self, separator);
6712}
6713
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006714PyDoc_STRVAR(rpartition__doc__,
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00006715"S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006716\n\
6717Searches for the separator sep in S, starting at the end of S, and returns\n\
6718the part before it, the separator itself, and the part after it. If the\n\
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00006719separator is not found, returns two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006720
6721static PyObject*
6722unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6723{
6724 return PyUnicode_RPartition((PyObject *)self, separator);
6725}
6726
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006727PyObject *PyUnicode_RSplit(PyObject *s,
6728 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006729 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006730{
6731 PyObject *result;
6732
6733 s = PyUnicode_FromObject(s);
6734 if (s == NULL)
6735 return NULL;
6736 if (sep != NULL) {
6737 sep = PyUnicode_FromObject(sep);
6738 if (sep == NULL) {
6739 Py_DECREF(s);
6740 return NULL;
6741 }
6742 }
6743
6744 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6745
6746 Py_DECREF(s);
6747 Py_XDECREF(sep);
6748 return result;
6749}
6750
6751PyDoc_STRVAR(rsplit__doc__,
6752"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6753\n\
6754Return a list of the words in S, using sep as the\n\
6755delimiter string, starting at the end of the string and\n\
6756working to the front. If maxsplit is given, at most maxsplit\n\
6757splits are done. If sep is not specified, any whitespace string\n\
6758is a separator.");
6759
6760static PyObject*
6761unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6762{
6763 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006764 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006765
Martin v. Löwis18e16552006-02-15 17:27:45 +00006766 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006767 return NULL;
6768
6769 if (substring == Py_None)
6770 return rsplit(self, NULL, maxcount);
6771 else if (PyUnicode_Check(substring))
6772 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6773 else
6774 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6775}
6776
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006777PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006778"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779\n\
6780Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006781Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006782is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783
6784static PyObject*
6785unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6786{
Guido van Rossum86662912000-04-11 15:38:46 +00006787 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006788
Guido van Rossum86662912000-04-11 15:38:46 +00006789 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790 return NULL;
6791
Guido van Rossum86662912000-04-11 15:38:46 +00006792 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793}
6794
6795static
6796PyObject *unicode_str(PyUnicodeObject *self)
6797{
Fred Drakee4315f52000-05-09 19:53:39 +00006798 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799}
6800
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006801PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802"S.swapcase() -> unicode\n\
6803\n\
6804Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006805and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806
6807static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006808unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810 return fixup(self, fixswapcase);
6811}
6812
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006813PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814"S.translate(table) -> unicode\n\
6815\n\
6816Return a copy of the string S, where all characters have been mapped\n\
6817through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006818Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6819Unmapped characters are left untouched. Characters mapped to None\n\
6820are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821
6822static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006823unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824{
Tim Petersced69f82003-09-16 20:30:58 +00006825 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006827 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828 "ignore");
6829}
6830
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006831PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006832"S.upper() -> unicode\n\
6833\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006834Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835
6836static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006837unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006838{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839 return fixup(self, fixupper);
6840}
6841
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006842PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843"S.zfill(width) -> unicode\n\
6844\n\
6845Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006846of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847
6848static PyObject *
6849unicode_zfill(PyUnicodeObject *self, PyObject *args)
6850{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006851 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852 PyUnicodeObject *u;
6853
Martin v. Löwis18e16552006-02-15 17:27:45 +00006854 Py_ssize_t width;
6855 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856 return NULL;
6857
6858 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006859 if (PyUnicode_CheckExact(self)) {
6860 Py_INCREF(self);
6861 return (PyObject*) self;
6862 }
6863 else
6864 return PyUnicode_FromUnicode(
6865 PyUnicode_AS_UNICODE(self),
6866 PyUnicode_GET_SIZE(self)
6867 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868 }
6869
6870 fill = width - self->length;
6871
6872 u = pad(self, fill, 0, '0');
6873
Walter Dörwald068325e2002-04-15 13:36:47 +00006874 if (u == NULL)
6875 return NULL;
6876
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877 if (u->str[fill] == '+' || u->str[fill] == '-') {
6878 /* move sign to beginning of string */
6879 u->str[0] = u->str[fill];
6880 u->str[fill] = '0';
6881 }
6882
6883 return (PyObject*) u;
6884}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885
#if 0
/* Compiled out: returns the value of unicode_freelist_size as a Python
   int (presumably a debugging aid -- confirm before re-enabling). */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
6893
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006894PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006895"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006897Return True if S starts with the specified prefix, False otherwise.\n\
6898With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00006899With optional end, stop comparing S at that position.\n\
6900prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901
6902static PyObject *
6903unicode_startswith(PyUnicodeObject *self,
6904 PyObject *args)
6905{
Georg Brandl24250812006-06-09 18:45:48 +00006906 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006908 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006909 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00006910 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911
Georg Brandl24250812006-06-09 18:45:48 +00006912 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00006913 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00006915 if (PyTuple_Check(subobj)) {
6916 Py_ssize_t i;
6917 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6918 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6919 PyTuple_GET_ITEM(subobj, i));
6920 if (substring == NULL)
6921 return NULL;
6922 result = tailmatch(self, substring, start, end, -1);
6923 Py_DECREF(substring);
6924 if (result) {
6925 Py_RETURN_TRUE;
6926 }
6927 }
6928 /* nothing matched */
6929 Py_RETURN_FALSE;
6930 }
6931 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006932 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00006933 return NULL;
6934 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00006936 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937}
6938
6939
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006940PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006941"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006943Return True if S ends with the specified suffix, False otherwise.\n\
6944With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00006945With optional end, stop comparing S at that position.\n\
6946suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947
6948static PyObject *
6949unicode_endswith(PyUnicodeObject *self,
6950 PyObject *args)
6951{
Georg Brandl24250812006-06-09 18:45:48 +00006952 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006954 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006955 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00006956 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006957
Georg Brandl24250812006-06-09 18:45:48 +00006958 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
6959 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00006961 if (PyTuple_Check(subobj)) {
6962 Py_ssize_t i;
6963 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6964 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6965 PyTuple_GET_ITEM(subobj, i));
6966 if (substring == NULL)
6967 return NULL;
6968 result = tailmatch(self, substring, start, end, +1);
6969 Py_DECREF(substring);
6970 if (result) {
6971 Py_RETURN_TRUE;
6972 }
6973 }
6974 Py_RETURN_FALSE;
6975 }
6976 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006977 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00006978 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979
Georg Brandl24250812006-06-09 18:45:48 +00006980 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00006982 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983}
6984
6985
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006986
6987static PyObject *
6988unicode_getnewargs(PyUnicodeObject *v)
6989{
6990 return Py_BuildValue("(u#)", v->str, v->length);
6991}
6992
6993
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994static PyMethodDef unicode_methods[] = {
6995
6996 /* Order is according to common usage: often used methods should
6997 appear first, since lookup is done sequentially. */
6998
Georg Brandlecdc0a92006-03-30 12:19:07 +00006999 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007000 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7001 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007002 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007003 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7004 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7005 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7006 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7007 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7008 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7009 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007010 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007011 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7012 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7013 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007014 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007015 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007016/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7017 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7018 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7019 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007020 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007021 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007022 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007023 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007024 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7025 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7026 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7027 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7028 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7029 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7030 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7031 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7032 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7033 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7034 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7035 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7036 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7037 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007038 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007039#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007040 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041#endif
7042
7043#if 0
7044 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007045 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046#endif
7047
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007048 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007049 {NULL, NULL}
7050};
7051
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007052static PyObject *
7053unicode_mod(PyObject *v, PyObject *w)
7054{
7055 if (!PyUnicode_Check(v)) {
7056 Py_INCREF(Py_NotImplemented);
7057 return Py_NotImplemented;
7058 }
7059 return PyUnicode_Format(v, w);
7060}
7061
7062static PyNumberMethods unicode_as_number = {
7063 0, /*nb_add*/
7064 0, /*nb_subtract*/
7065 0, /*nb_multiply*/
7066 0, /*nb_divide*/
7067 unicode_mod, /*nb_remainder*/
7068};
7069
Guido van Rossumd57fd912000-03-10 22:53:23 +00007070static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007071 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00007072 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007073 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7074 (ssizeargfunc) unicode_getitem, /* sq_item */
7075 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007076 0, /* sq_ass_item */
7077 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00007078 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007079};
7080
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007081static PyObject*
7082unicode_subscript(PyUnicodeObject* self, PyObject* item)
7083{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007084 if (PyIndex_Check(item)) {
7085 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007086 if (i == -1 && PyErr_Occurred())
7087 return NULL;
7088 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007089 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007090 return unicode_getitem(self, i);
7091 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007092 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007093 Py_UNICODE* source_buf;
7094 Py_UNICODE* result_buf;
7095 PyObject* result;
7096
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007097 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007098 &start, &stop, &step, &slicelength) < 0) {
7099 return NULL;
7100 }
7101
7102 if (slicelength <= 0) {
7103 return PyUnicode_FromUnicode(NULL, 0);
7104 } else {
7105 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Anthony Baxtera6286212006-04-11 07:42:36 +00007106 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7107 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007108
7109 if (result_buf == NULL)
7110 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007111
7112 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7113 result_buf[i] = source_buf[cur];
7114 }
Tim Petersced69f82003-09-16 20:30:58 +00007115
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007116 result = PyUnicode_FromUnicode(result_buf, slicelength);
7117 PyMem_FREE(result_buf);
7118 return result;
7119 }
7120 } else {
7121 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7122 return NULL;
7123 }
7124}
7125
7126static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007127 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007128 (binaryfunc)unicode_subscript, /* mp_subscript */
7129 (objobjargproc)0, /* mp_ass_subscript */
7130};
7131
Martin v. Löwis18e16552006-02-15 17:27:45 +00007132static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007133unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007134 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007135 const void **ptr)
7136{
7137 if (index != 0) {
7138 PyErr_SetString(PyExc_SystemError,
7139 "accessing non-existent unicode segment");
7140 return -1;
7141 }
7142 *ptr = (void *) self->str;
7143 return PyUnicode_GET_DATA_SIZE(self);
7144}
7145
Martin v. Löwis18e16552006-02-15 17:27:45 +00007146static Py_ssize_t
7147unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007148 const void **ptr)
7149{
7150 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007151 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152 return -1;
7153}
7154
7155static int
7156unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007157 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007158{
7159 if (lenp)
7160 *lenp = PyUnicode_GET_DATA_SIZE(self);
7161 return 1;
7162}
7163
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007164static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007165unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007166 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167 const void **ptr)
7168{
7169 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007170
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171 if (index != 0) {
7172 PyErr_SetString(PyExc_SystemError,
7173 "accessing non-existent unicode segment");
7174 return -1;
7175 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007176 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007177 if (str == NULL)
7178 return -1;
7179 *ptr = (void *) PyString_AS_STRING(str);
7180 return PyString_GET_SIZE(str);
7181}
7182
7183/* Helpers for PyUnicode_Format() */
7184
7185static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007186getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007187{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007188 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007189 if (argidx < arglen) {
7190 (*p_argidx)++;
7191 if (arglen < 0)
7192 return args;
7193 else
7194 return PyTuple_GetItem(args, argidx);
7195 }
7196 PyErr_SetString(PyExc_TypeError,
7197 "not enough arguments for format string");
7198 return NULL;
7199}
7200
/* Bit flags collected while parsing a format specifier; each one is set by
   the flag character shown next to it. */
#define F_LJUST (1 << 0)        /* '-' (also set for a negative '*' width) */
#define F_SIGN  (1 << 1)        /* '+' */
#define F_BLANK (1 << 2)        /* ' ' */
#define F_ALT   (1 << 3)        /* '#' */
#define F_ZERO  (1 << 4)        /* '0' */
7206
Martin v. Löwis18e16552006-02-15 17:27:45 +00007207static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007208strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007209{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007210 register Py_ssize_t i;
7211 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212 for (i = len - 1; i >= 0; i--)
7213 buffer[i] = (Py_UNICODE) charbuffer[i];
7214
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215 return len;
7216}
7217
Neal Norwitzfc76d632006-01-10 06:03:13 +00007218static int
7219doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7220{
Tim Peters15231542006-02-16 01:08:01 +00007221 Py_ssize_t result;
7222
Neal Norwitzfc76d632006-01-10 06:03:13 +00007223 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007224 result = strtounicode(buffer, (char *)buffer);
7225 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007226}
7227
7228static int
7229longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7230{
Tim Peters15231542006-02-16 01:08:01 +00007231 Py_ssize_t result;
7232
Neal Norwitzfc76d632006-01-10 06:03:13 +00007233 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007234 result = strtounicode(buffer, (char *)buffer);
7235 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007236}
7237
Guido van Rossum078151d2002-08-11 04:24:12 +00007238/* XXX To save some code duplication, formatfloat/long/int could have been
7239 shared with stringobject.c, converting from 8-bit to Unicode after the
7240 formatting is done. */
7241
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242static int
7243formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007244 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245 int flags,
7246 int prec,
7247 int type,
7248 PyObject *v)
7249{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007250 /* fmt = '%#.' + `prec` + `type`
7251 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007252 char fmt[20];
7253 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007254
Guido van Rossumd57fd912000-03-10 22:53:23 +00007255 x = PyFloat_AsDouble(v);
7256 if (x == -1.0 && PyErr_Occurred())
7257 return -1;
7258 if (prec < 0)
7259 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007260 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7261 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007262 /* Worst case length calc to ensure no buffer overrun:
7263
7264 'g' formats:
7265 fmt = %#.<prec>g
7266 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7267 for any double rep.)
7268 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7269
7270 'f' formats:
7271 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7272 len = 1 + 50 + 1 + prec = 52 + prec
7273
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007274 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007275 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007276
7277 */
7278 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7279 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007280 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007281 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007282 return -1;
7283 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007284 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7285 (flags&F_ALT) ? "#" : "",
7286 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007287 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007288}
7289
Tim Peters38fd5b62000-09-21 05:43:11 +00007290static PyObject*
7291formatlong(PyObject *val, int flags, int prec, int type)
7292{
7293 char *buf;
7294 int i, len;
7295 PyObject *str; /* temporary string object. */
7296 PyUnicodeObject *result;
7297
7298 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7299 if (!str)
7300 return NULL;
7301 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007302 if (!result) {
7303 Py_DECREF(str);
7304 return NULL;
7305 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007306 for (i = 0; i < len; i++)
7307 result->str[i] = buf[i];
7308 result->str[len] = 0;
7309 Py_DECREF(str);
7310 return (PyObject*)result;
7311}
7312
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313static int
7314formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007315 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007316 int flags,
7317 int prec,
7318 int type,
7319 PyObject *v)
7320{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007321 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007322 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7323 * + 1 + 1
7324 * = 24
7325 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007326 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007327 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007328 long x;
7329
7330 x = PyInt_AsLong(v);
7331 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007332 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007333 if (x < 0 && type == 'u') {
7334 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007335 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007336 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7337 sign = "-";
7338 else
7339 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007341 prec = 1;
7342
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007343 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7344 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007345 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007346 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007347 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007348 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007349 return -1;
7350 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007351
7352 if ((flags & F_ALT) &&
7353 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007354 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007355 * of issues that cause pain:
7356 * - when 0 is being converted, the C standard leaves off
7357 * the '0x' or '0X', which is inconsistent with other
7358 * %#x/%#X conversions and inconsistent with Python's
7359 * hex() function
7360 * - there are platforms that violate the standard and
7361 * convert 0 with the '0x' or '0X'
7362 * (Metrowerks, Compaq Tru64)
7363 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007364 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007365 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007366 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007367 * We can achieve the desired consistency by inserting our
7368 * own '0x' or '0X' prefix, and substituting %x/%X in place
7369 * of %#x/%#X.
7370 *
7371 * Note that this is the same approach as used in
7372 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007373 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007374 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7375 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007376 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007377 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007378 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7379 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007380 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007381 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007382 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007383 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007384 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007385 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007386}
7387
7388static int
7389formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007390 size_t buflen,
7391 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007392{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007393 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007394 if (PyUnicode_Check(v)) {
7395 if (PyUnicode_GET_SIZE(v) != 1)
7396 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007397 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007398 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007399
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007400 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007401 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007402 goto onError;
7403 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7404 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405
7406 else {
7407 /* Integer input truncated to a character */
7408 long x;
7409 x = PyInt_AsLong(v);
7410 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007411 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007412#ifdef Py_UNICODE_WIDE
7413 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007414 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007415 "%c arg not in range(0x110000) "
7416 "(wide Python build)");
7417 return -1;
7418 }
7419#else
7420 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007421 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007422 "%c arg not in range(0x10000) "
7423 "(narrow Python build)");
7424 return -1;
7425 }
7426#endif
7427 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428 }
7429 buf[1] = '\0';
7430 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007431
7432 onError:
7433 PyErr_SetString(PyExc_TypeError,
7434 "%c requires int or char");
7435 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436}
7437
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the cast cannot regroup
   with a neighbouring operator at the point of use.
*/
#define FORMATBUFLEN ((size_t)120)
7447
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448PyObject *PyUnicode_Format(PyObject *format,
7449 PyObject *args)
7450{
7451 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007452 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453 int args_owned = 0;
7454 PyUnicodeObject *result = NULL;
7455 PyObject *dict = NULL;
7456 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007457
Guido van Rossumd57fd912000-03-10 22:53:23 +00007458 if (format == NULL || args == NULL) {
7459 PyErr_BadInternalCall();
7460 return NULL;
7461 }
7462 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007463 if (uformat == NULL)
7464 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007465 fmt = PyUnicode_AS_UNICODE(uformat);
7466 fmtcnt = PyUnicode_GET_SIZE(uformat);
7467
7468 reslen = rescnt = fmtcnt + 100;
7469 result = _PyUnicode_New(reslen);
7470 if (result == NULL)
7471 goto onError;
7472 res = PyUnicode_AS_UNICODE(result);
7473
7474 if (PyTuple_Check(args)) {
7475 arglen = PyTuple_Size(args);
7476 argidx = 0;
7477 }
7478 else {
7479 arglen = -1;
7480 argidx = -2;
7481 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007482 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7483 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007484 dict = args;
7485
7486 while (--fmtcnt >= 0) {
7487 if (*fmt != '%') {
7488 if (--rescnt < 0) {
7489 rescnt = fmtcnt + 100;
7490 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007491 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007492 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007493 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7494 --rescnt;
7495 }
7496 *res++ = *fmt++;
7497 }
7498 else {
7499 /* Got a format specifier */
7500 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007501 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007502 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007503 Py_UNICODE c = '\0';
7504 Py_UNICODE fill;
7505 PyObject *v = NULL;
7506 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007507 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007509 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007510 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007511
7512 fmt++;
7513 if (*fmt == '(') {
7514 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007515 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007516 PyObject *key;
7517 int pcount = 1;
7518
7519 if (dict == NULL) {
7520 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007521 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007522 goto onError;
7523 }
7524 ++fmt;
7525 --fmtcnt;
7526 keystart = fmt;
7527 /* Skip over balanced parentheses */
7528 while (pcount > 0 && --fmtcnt >= 0) {
7529 if (*fmt == ')')
7530 --pcount;
7531 else if (*fmt == '(')
7532 ++pcount;
7533 fmt++;
7534 }
7535 keylen = fmt - keystart - 1;
7536 if (fmtcnt < 0 || pcount > 0) {
7537 PyErr_SetString(PyExc_ValueError,
7538 "incomplete format key");
7539 goto onError;
7540 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007541#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007542 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007543 then looked up since Python uses strings to hold
7544 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007545 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546 key = PyUnicode_EncodeUTF8(keystart,
7547 keylen,
7548 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007549#else
7550 key = PyUnicode_FromUnicode(keystart, keylen);
7551#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552 if (key == NULL)
7553 goto onError;
7554 if (args_owned) {
7555 Py_DECREF(args);
7556 args_owned = 0;
7557 }
7558 args = PyObject_GetItem(dict, key);
7559 Py_DECREF(key);
7560 if (args == NULL) {
7561 goto onError;
7562 }
7563 args_owned = 1;
7564 arglen = -1;
7565 argidx = -2;
7566 }
7567 while (--fmtcnt >= 0) {
7568 switch (c = *fmt++) {
7569 case '-': flags |= F_LJUST; continue;
7570 case '+': flags |= F_SIGN; continue;
7571 case ' ': flags |= F_BLANK; continue;
7572 case '#': flags |= F_ALT; continue;
7573 case '0': flags |= F_ZERO; continue;
7574 }
7575 break;
7576 }
7577 if (c == '*') {
7578 v = getnextarg(args, arglen, &argidx);
7579 if (v == NULL)
7580 goto onError;
7581 if (!PyInt_Check(v)) {
7582 PyErr_SetString(PyExc_TypeError,
7583 "* wants int");
7584 goto onError;
7585 }
7586 width = PyInt_AsLong(v);
7587 if (width < 0) {
7588 flags |= F_LJUST;
7589 width = -width;
7590 }
7591 if (--fmtcnt >= 0)
7592 c = *fmt++;
7593 }
7594 else if (c >= '0' && c <= '9') {
7595 width = c - '0';
7596 while (--fmtcnt >= 0) {
7597 c = *fmt++;
7598 if (c < '0' || c > '9')
7599 break;
7600 if ((width*10) / 10 != width) {
7601 PyErr_SetString(PyExc_ValueError,
7602 "width too big");
7603 goto onError;
7604 }
7605 width = width*10 + (c - '0');
7606 }
7607 }
7608 if (c == '.') {
7609 prec = 0;
7610 if (--fmtcnt >= 0)
7611 c = *fmt++;
7612 if (c == '*') {
7613 v = getnextarg(args, arglen, &argidx);
7614 if (v == NULL)
7615 goto onError;
7616 if (!PyInt_Check(v)) {
7617 PyErr_SetString(PyExc_TypeError,
7618 "* wants int");
7619 goto onError;
7620 }
7621 prec = PyInt_AsLong(v);
7622 if (prec < 0)
7623 prec = 0;
7624 if (--fmtcnt >= 0)
7625 c = *fmt++;
7626 }
7627 else if (c >= '0' && c <= '9') {
7628 prec = c - '0';
7629 while (--fmtcnt >= 0) {
7630 c = Py_CHARMASK(*fmt++);
7631 if (c < '0' || c > '9')
7632 break;
7633 if ((prec*10) / 10 != prec) {
7634 PyErr_SetString(PyExc_ValueError,
7635 "prec too big");
7636 goto onError;
7637 }
7638 prec = prec*10 + (c - '0');
7639 }
7640 }
7641 } /* prec */
7642 if (fmtcnt >= 0) {
7643 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644 if (--fmtcnt >= 0)
7645 c = *fmt++;
7646 }
7647 }
7648 if (fmtcnt < 0) {
7649 PyErr_SetString(PyExc_ValueError,
7650 "incomplete format");
7651 goto onError;
7652 }
7653 if (c != '%') {
7654 v = getnextarg(args, arglen, &argidx);
7655 if (v == NULL)
7656 goto onError;
7657 }
7658 sign = 0;
7659 fill = ' ';
7660 switch (c) {
7661
7662 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007663 pbuf = formatbuf;
7664 /* presume that buffer length is at least 1 */
7665 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007666 len = 1;
7667 break;
7668
7669 case 's':
7670 case 'r':
7671 if (PyUnicode_Check(v) && c == 's') {
7672 temp = v;
7673 Py_INCREF(temp);
7674 }
7675 else {
7676 PyObject *unicode;
7677 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007678 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007679 else
7680 temp = PyObject_Repr(v);
7681 if (temp == NULL)
7682 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007683 if (PyUnicode_Check(temp))
7684 /* nothing to do */;
7685 else if (PyString_Check(temp)) {
7686 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007687 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007688 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007689 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007691 Py_DECREF(temp);
7692 temp = unicode;
7693 if (temp == NULL)
7694 goto onError;
7695 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007696 else {
7697 Py_DECREF(temp);
7698 PyErr_SetString(PyExc_TypeError,
7699 "%s argument has non-string str()");
7700 goto onError;
7701 }
7702 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007703 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007704 len = PyUnicode_GET_SIZE(temp);
7705 if (prec >= 0 && len > prec)
7706 len = prec;
7707 break;
7708
7709 case 'i':
7710 case 'd':
7711 case 'u':
7712 case 'o':
7713 case 'x':
7714 case 'X':
7715 if (c == 'i')
7716 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007717 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007718 temp = formatlong(v, flags, prec, c);
7719 if (!temp)
7720 goto onError;
7721 pbuf = PyUnicode_AS_UNICODE(temp);
7722 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007723 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007725 else {
7726 pbuf = formatbuf;
7727 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7728 flags, prec, c, v);
7729 if (len < 0)
7730 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007731 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007732 }
7733 if (flags & F_ZERO)
7734 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007735 break;
7736
7737 case 'e':
7738 case 'E':
7739 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007740 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007741 case 'g':
7742 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007743 if (c == 'F')
7744 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007745 pbuf = formatbuf;
7746 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7747 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748 if (len < 0)
7749 goto onError;
7750 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007751 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752 fill = '0';
7753 break;
7754
7755 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007756 pbuf = formatbuf;
7757 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758 if (len < 0)
7759 goto onError;
7760 break;
7761
7762 default:
7763 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007764 "unsupported format character '%c' (0x%x) "
7765 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00007766 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007767 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00007768 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007769 goto onError;
7770 }
7771 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007772 if (*pbuf == '-' || *pbuf == '+') {
7773 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007774 len--;
7775 }
7776 else if (flags & F_SIGN)
7777 sign = '+';
7778 else if (flags & F_BLANK)
7779 sign = ' ';
7780 else
7781 sign = 0;
7782 }
7783 if (width < len)
7784 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007785 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007786 reslen -= rescnt;
7787 rescnt = width + fmtcnt + 100;
7788 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007789 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007790 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007791 PyErr_NoMemory();
7792 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007793 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007794 if (_PyUnicode_Resize(&result, reslen) < 0) {
7795 Py_XDECREF(temp);
7796 goto onError;
7797 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798 res = PyUnicode_AS_UNICODE(result)
7799 + reslen - rescnt;
7800 }
7801 if (sign) {
7802 if (fill != ' ')
7803 *res++ = sign;
7804 rescnt--;
7805 if (width > len)
7806 width--;
7807 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007808 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7809 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007810 assert(pbuf[1] == c);
7811 if (fill != ' ') {
7812 *res++ = *pbuf++;
7813 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007814 }
Tim Petersfff53252001-04-12 18:38:48 +00007815 rescnt -= 2;
7816 width -= 2;
7817 if (width < 0)
7818 width = 0;
7819 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007820 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821 if (width > len && !(flags & F_LJUST)) {
7822 do {
7823 --rescnt;
7824 *res++ = fill;
7825 } while (--width > len);
7826 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007827 if (fill == ' ') {
7828 if (sign)
7829 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007830 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007831 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007832 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007833 *res++ = *pbuf++;
7834 *res++ = *pbuf++;
7835 }
7836 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007837 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838 res += len;
7839 rescnt -= len;
7840 while (--width >= len) {
7841 --rescnt;
7842 *res++ = ' ';
7843 }
7844 if (dict && (argidx < arglen) && c != '%') {
7845 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007846 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007847 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007848 goto onError;
7849 }
7850 Py_XDECREF(temp);
7851 } /* '%' */
7852 } /* until end */
7853 if (argidx < arglen && !dict) {
7854 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007855 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856 goto onError;
7857 }
7858
Thomas Woutersa96affe2006-03-12 00:29:36 +00007859 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7860 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861 if (args_owned) {
7862 Py_DECREF(args);
7863 }
7864 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007865 return (PyObject *)result;
7866
7867 onError:
7868 Py_XDECREF(result);
7869 Py_DECREF(uformat);
7870 if (args_owned) {
7871 Py_DECREF(args);
7872 }
7873 return NULL;
7874}
7875
7876static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007877 (readbufferproc) unicode_buffer_getreadbuf,
7878 (writebufferproc) unicode_buffer_getwritebuf,
7879 (segcountproc) unicode_buffer_getsegcount,
7880 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007881};
7882
Jeremy Hylton938ace62002-07-17 16:30:39 +00007883static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007884unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7885
Tim Peters6d6c1a32001-08-02 04:15:00 +00007886static PyObject *
7887unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7888{
7889 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007890 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007891 char *encoding = NULL;
7892 char *errors = NULL;
7893
Guido van Rossume023fe02001-08-30 03:12:59 +00007894 if (type != &PyUnicode_Type)
7895 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007896 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7897 kwlist, &x, &encoding, &errors))
7898 return NULL;
7899 if (x == NULL)
7900 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007901 if (encoding == NULL && errors == NULL)
7902 return PyObject_Unicode(x);
7903 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007904 return PyUnicode_FromEncodedObject(x, encoding, errors);
7905}
7906
Guido van Rossume023fe02001-08-30 03:12:59 +00007907static PyObject *
7908unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7909{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007910 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007911 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007912
7913 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7914 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7915 if (tmp == NULL)
7916 return NULL;
7917 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007918 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007919 if (pnew == NULL) {
7920 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007921 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007922 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007923 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7924 if (pnew->str == NULL) {
7925 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007926 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007927 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007928 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007929 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007930 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7931 pnew->length = n;
7932 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007933 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007934 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007935}
7936
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007937PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007938"unicode(string [, encoding[, errors]]) -> object\n\
7939\n\
7940Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007941encoding defaults to the current default string encoding.\n\
7942errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007943
Guido van Rossumd57fd912000-03-10 22:53:23 +00007944PyTypeObject PyUnicode_Type = {
7945 PyObject_HEAD_INIT(&PyType_Type)
7946 0, /* ob_size */
7947 "unicode", /* tp_name */
7948 sizeof(PyUnicodeObject), /* tp_size */
7949 0, /* tp_itemsize */
7950 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007951 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007952 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007953 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007954 0, /* tp_setattr */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00007955 0, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00007956 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007957 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007958 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007959 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007960 (hashfunc) unicode_hash, /* tp_hash*/
7961 0, /* tp_call*/
7962 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007963 PyObject_GenericGetAttr, /* tp_getattro */
7964 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007965 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007966 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7967 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007968 unicode_doc, /* tp_doc */
7969 0, /* tp_traverse */
7970 0, /* tp_clear */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00007971 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007972 0, /* tp_weaklistoffset */
7973 0, /* tp_iter */
7974 0, /* tp_iternext */
7975 unicode_methods, /* tp_methods */
7976 0, /* tp_members */
7977 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007978 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007979 0, /* tp_dict */
7980 0, /* tp_descr_get */
7981 0, /* tp_descr_set */
7982 0, /* tp_dictoffset */
7983 0, /* tp_init */
7984 0, /* tp_alloc */
7985 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007986 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987};
7988
7989/* Initialize the Unicode implementation */
7990
Thomas Wouters78890102000-07-22 19:25:51 +00007991void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007993 int i;
7994
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007995 /* XXX - move this array to unicodectype.c ? */
7996 Py_UNICODE linebreak[] = {
7997 0x000A, /* LINE FEED */
7998 0x000D, /* CARRIAGE RETURN */
7999 0x001C, /* FILE SEPARATOR */
8000 0x001D, /* GROUP SEPARATOR */
8001 0x001E, /* RECORD SEPARATOR */
8002 0x0085, /* NEXT LINE */
8003 0x2028, /* LINE SEPARATOR */
8004 0x2029, /* PARAGRAPH SEPARATOR */
8005 };
8006
Fred Drakee4315f52000-05-09 19:53:39 +00008007 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008008 unicode_freelist = NULL;
8009 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008011 if (!unicode_empty)
8012 return;
8013
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008014 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008015 for (i = 0; i < 256; i++)
8016 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008017 if (PyType_Ready(&PyUnicode_Type) < 0)
8018 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008019
8020 /* initialize the linebreak bloom filter */
8021 bloom_linebreak = make_bloom_mask(
8022 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8023 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008024
8025 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026}
8027
8028/* Finalize the Unicode implementation */
8029
8030void
Thomas Wouters78890102000-07-22 19:25:51 +00008031_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008032{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008033 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008034 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008036 Py_XDECREF(unicode_empty);
8037 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008038
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008039 for (i = 0; i < 256; i++) {
8040 if (unicode_latin1[i]) {
8041 Py_DECREF(unicode_latin1[i]);
8042 unicode_latin1[i] = NULL;
8043 }
8044 }
8045
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008046 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047 PyUnicodeObject *v = u;
8048 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008049 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008050 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008051 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008052 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008053 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008054 unicode_freelist = NULL;
8055 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008056}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008057
Anthony Baxterac6bd462006-04-13 02:06:09 +00008058#ifdef __cplusplus
8059}
8060#endif
8061
8062
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008063/*
8064Local variables:
8065c-basic-offset: 4
8066indent-tabs-mode: nil
8067End:
8068*/