blob: 064caebd50c8a1a0508cab19a9b3bd4ae9a90113 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of Unicode objects kept on the free list */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Threshold for the free list "keep-alive" optimization.

   An object that is put on the free list keeps its character buffer
   allocated as long as its length is below this limit.  This saves
   malloc() calls when small Unicode objects are created again.

   At worst this results in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   A limit of 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte-order switches; little endian is assumed unless WORDS_BIGENDIAN
   is defined */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000116PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000117{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000118#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000119 return 0x10FFFF;
120#else
121 /* This is actually an illegal character, so it should
122 not be passed to unichr. */
123 return 0xFFFF;
124#endif
125}
126
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000127/* --- Bloom Filters ----------------------------------------------------- */
128
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask and take the 5
   least-significant bits of each Unicode character as the bit index. */

/* the linebreak mask is set up by _PyUnicode_Init() below */
134
/* Type of a bloom bitmask; only bits 0..31 are ever used. */
#define BLOOM_MASK unsigned long

/* Bloom mask covering the line break characters. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of `ch` is set in
   `mask`.  A zero result means `ch` is definitely not in the set the
   mask was built from; a non-zero result means it may be.

   The shift is done on an unsigned long (1UL): `1 << 31` on a plain
   int overflows, which is undefined behaviour. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line break test that uses the bloom mask to skip the (more
   expensive) Py_UNICODE_ISLINEBREAK() lookup for most characters. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
143
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000144Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000145{
146 /* calculate simple bloom-style bitmask for a given unicode string */
147
148 long mask;
149 Py_ssize_t i;
150
151 mask = 0;
152 for (i = 0; i < len; i++)
153 mask |= (1 << (ptr[i] & 0x1F));
154
155 return mask;
156}
157
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000158Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000159{
160 Py_ssize_t i;
161
162 for (i = 0; i < setlen; i++)
163 if (set[i] == chr)
164 return 1;
165
Fredrik Lundh77633512006-05-23 19:47:35 +0000166 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000167}
168
/* Membership test for `chr` in `set`: the bloom mask rejects most
   non-members cheaply, unicode_member() confirms the rest.  The
   expansion is fully parenthesized so that it can be negated or
   combined with other operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
171
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172/* --- Unicode Object ----------------------------------------------------- */
173
174static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000176 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177{
178 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000179
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000180 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000181 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000182 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184 /* Resizing shared object (unicode_empty or single character
185 objects) in-place is not allowed. Use PyUnicode_Resize()
186 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000187
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000188 if (unicode == unicode_empty ||
189 (unicode->length == 1 &&
190 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000191 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000192 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 return -1;
195 }
196
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000197 /* We allocate one more byte to make sure the string is Ux0000 terminated.
198 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000199 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000200 it contains). */
201
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 oldstr = unicode->str;
203 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
204 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000205 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000206 PyErr_NoMemory();
207 return -1;
208 }
209 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000210 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000212 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000214 if (unicode->defenc) {
215 Py_DECREF(unicode->defenc);
216 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217 }
218 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000219
Guido van Rossumd57fd912000-03-10 22:53:23 +0000220 return 0;
221}
222
223/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000224 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225
226 XXX This allocator could further be enhanced by assuring that the
227 free list never reduces its size below 1.
228
229*/
230
231static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000232PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233{
234 register PyUnicodeObject *unicode;
235
Andrew Dalkee0df7622006-05-27 11:04:36 +0000236 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237 if (length == 0 && unicode_empty != NULL) {
238 Py_INCREF(unicode_empty);
239 return unicode_empty;
240 }
241
242 /* Unicode freelist & memory allocation */
243 if (unicode_freelist) {
244 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000245 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000248 /* Keep-Alive optimization: we only upsize the buffer,
249 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000250 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000251 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000252 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000253 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254 }
255 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000256 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000258 }
259 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000262 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 if (unicode == NULL)
264 return NULL;
265 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
266 }
267
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000268 if (!unicode->str) {
269 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000270 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000271 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000272 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000273 * the caller fails before initializing str -- unicode_resize()
274 * reads str[0], and the Keep-Alive optimization can keep memory
275 * allocated for str alive across a call to unicode_dealloc(unicode).
276 * We don't want unicode_resize to read uninitialized memory in
277 * that case.
278 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000279 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000283 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285
286 onError:
287 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000288 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000289 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290}
291
292static
Guido van Rossum9475a232001-10-05 20:51:39 +0000293void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000295 if (PyUnicode_CheckExact(unicode) &&
296 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000297 /* Keep-Alive optimization */
298 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000299 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 unicode->str = NULL;
301 unicode->length = 0;
302 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000303 if (unicode->defenc) {
304 Py_DECREF(unicode->defenc);
305 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000306 }
307 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 *(PyUnicodeObject **)unicode = unicode_freelist;
309 unicode_freelist = unicode;
310 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311 }
312 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000313 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000314 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000315 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316 }
317}
318
Martin v. Löwis18e16552006-02-15 17:27:45 +0000319int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000320{
321 register PyUnicodeObject *v;
322
323 /* Argument checks */
324 if (unicode == NULL) {
325 PyErr_BadInternalCall();
326 return -1;
327 }
328 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000329 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000330 PyErr_BadInternalCall();
331 return -1;
332 }
333
334 /* Resizing unicode_empty and single character objects is not
335 possible since these are being shared. We simply return a fresh
336 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000337 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000338 (v == unicode_empty || v->length == 1)) {
339 PyUnicodeObject *w = _PyUnicode_New(length);
340 if (w == NULL)
341 return -1;
342 Py_UNICODE_COPY(w->str, v->str,
343 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000344 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000345 *unicode = (PyObject *)w;
346 return 0;
347 }
348
349 /* Note that we don't have to modify *unicode for unshared Unicode
350 objects, since we can modify them in-place. */
351 return unicode_resize(v, length);
352}
353
/* Internal API for use in unicodeobject.c only !  Convenience wrapper
   that casts the address of an object variable to PyObject ** before
   calling PyUnicode_Resize(). */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
357
Guido van Rossumd57fd912000-03-10 22:53:23 +0000358PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000359 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360{
361 PyUnicodeObject *unicode;
362
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000363 /* If the Unicode data is known at construction time, we can apply
364 some optimizations which share commonly used objects. */
365 if (u != NULL) {
366
367 /* Optimization for empty strings */
368 if (size == 0 && unicode_empty != NULL) {
369 Py_INCREF(unicode_empty);
370 return (PyObject *)unicode_empty;
371 }
372
373 /* Single character Unicode objects in the Latin-1 range are
374 shared when using this constructor */
375 if (size == 1 && *u < 256) {
376 unicode = unicode_latin1[*u];
377 if (!unicode) {
378 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000379 if (!unicode)
380 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000381 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000382 unicode_latin1[*u] = unicode;
383 }
384 Py_INCREF(unicode);
385 return (PyObject *)unicode;
386 }
387 }
Tim Petersced69f82003-09-16 20:30:58 +0000388
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 unicode = _PyUnicode_New(size);
390 if (!unicode)
391 return NULL;
392
393 /* Copy the Unicode data into the new object */
394 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000395 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000396
397 return (PyObject *)unicode;
398}
399
400#ifdef HAVE_WCHAR_H
401
402PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000403 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000404{
405 PyUnicodeObject *unicode;
406
407 if (w == NULL) {
408 PyErr_BadInternalCall();
409 return NULL;
410 }
411
412 unicode = _PyUnicode_New(size);
413 if (!unicode)
414 return NULL;
415
416 /* Copy the wchar_t data into the new object */
417#ifdef HAVE_USABLE_WCHAR_T
418 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000419#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000420 {
421 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000422 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000424 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425 *u++ = *w++;
426 }
427#endif
428
429 return (PyObject *)unicode;
430}
431
Martin v. Löwis18e16552006-02-15 17:27:45 +0000432Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
433 wchar_t *w,
434 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000435{
436 if (unicode == NULL) {
437 PyErr_BadInternalCall();
438 return -1;
439 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000440
441 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000442 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000443 size = PyUnicode_GET_SIZE(unicode) + 1;
444
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445#ifdef HAVE_USABLE_WCHAR_T
446 memcpy(w, unicode->str, size * sizeof(wchar_t));
447#else
448 {
449 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000450 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000451 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000452 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000453 *w++ = *u++;
454 }
455#endif
456
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000457 if (size > PyUnicode_GET_SIZE(unicode))
458 return PyUnicode_GET_SIZE(unicode);
459 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000460 return size;
461}
462
463#endif
464
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000465PyObject *PyUnicode_FromOrdinal(int ordinal)
466{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000467 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000468
469#ifdef Py_UNICODE_WIDE
470 if (ordinal < 0 || ordinal > 0x10ffff) {
471 PyErr_SetString(PyExc_ValueError,
472 "unichr() arg not in range(0x110000) "
473 "(wide Python build)");
474 return NULL;
475 }
476#else
477 if (ordinal < 0 || ordinal > 0xffff) {
478 PyErr_SetString(PyExc_ValueError,
479 "unichr() arg not in range(0x10000) "
480 "(narrow Python build)");
481 return NULL;
482 }
483#endif
484
Hye-Shik Chang40574832004-04-06 07:24:51 +0000485 s[0] = (Py_UNICODE)ordinal;
486 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000487}
488
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489PyObject *PyUnicode_FromObject(register PyObject *obj)
490{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491 /* XXX Perhaps we should make this API an alias of
492 PyObject_Unicode() instead ?! */
493 if (PyUnicode_CheckExact(obj)) {
494 Py_INCREF(obj);
495 return obj;
496 }
497 if (PyUnicode_Check(obj)) {
498 /* For a Unicode subtype that's not a Unicode object,
499 return a true Unicode object with the same data. */
500 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
501 PyUnicode_GET_SIZE(obj));
502 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000503 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
504}
505
506PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
507 const char *encoding,
508 const char *errors)
509{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000510 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000511 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000512 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000513
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 if (obj == NULL) {
515 PyErr_BadInternalCall();
516 return NULL;
517 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000518
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000519#if 0
520 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000521 that no encodings is given and then redirect to
522 PyObject_Unicode() which then applies the additional logic for
523 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000524
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000525 NOTE: This API should really only be used for object which
526 represent *encoded* Unicode !
527
528 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000529 if (PyUnicode_Check(obj)) {
530 if (encoding) {
531 PyErr_SetString(PyExc_TypeError,
532 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000533 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000534 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000535 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000536 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000537#else
538 if (PyUnicode_Check(obj)) {
539 PyErr_SetString(PyExc_TypeError,
540 "decoding Unicode is not supported");
541 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000542 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000543#endif
544
545 /* Coerce object */
546 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000547 s = PyString_AS_STRING(obj);
548 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000549 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000550 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
551 /* Overwrite the error message with something more useful in
552 case of a TypeError. */
553 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000554 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000555 "coercing to Unicode: need string or buffer, "
556 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000557 obj->ob_type->tp_name);
558 goto onError;
559 }
Tim Petersced69f82003-09-16 20:30:58 +0000560
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000561 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000562 if (len == 0) {
563 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000564 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000565 }
Tim Petersced69f82003-09-16 20:30:58 +0000566 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000567 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000568
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000569 return v;
570
571 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000572 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573}
574
575PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000576 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000577 const char *encoding,
578 const char *errors)
579{
580 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000581
582 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000583 encoding = PyUnicode_GetDefaultEncoding();
584
585 /* Shortcuts for common default encodings */
586 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000587 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000588 else if (strcmp(encoding, "latin-1") == 0)
589 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000590#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
591 else if (strcmp(encoding, "mbcs") == 0)
592 return PyUnicode_DecodeMBCS(s, size, errors);
593#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000594 else if (strcmp(encoding, "ascii") == 0)
595 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596
597 /* Decode via the codec registry */
598 buffer = PyBuffer_FromMemory((void *)s, size);
599 if (buffer == NULL)
600 goto onError;
601 unicode = PyCodec_Decode(buffer, encoding, errors);
602 if (unicode == NULL)
603 goto onError;
604 if (!PyUnicode_Check(unicode)) {
605 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000606 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000607 unicode->ob_type->tp_name);
608 Py_DECREF(unicode);
609 goto onError;
610 }
611 Py_DECREF(buffer);
612 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000613
Guido van Rossumd57fd912000-03-10 22:53:23 +0000614 onError:
615 Py_XDECREF(buffer);
616 return NULL;
617}
618
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000619PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
620 const char *encoding,
621 const char *errors)
622{
623 PyObject *v;
624
625 if (!PyUnicode_Check(unicode)) {
626 PyErr_BadArgument();
627 goto onError;
628 }
629
630 if (encoding == NULL)
631 encoding = PyUnicode_GetDefaultEncoding();
632
633 /* Decode via the codec registry */
634 v = PyCodec_Decode(unicode, encoding, errors);
635 if (v == NULL)
636 goto onError;
637 return v;
638
639 onError:
640 return NULL;
641}
642
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000644 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 const char *encoding,
646 const char *errors)
647{
648 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000649
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 unicode = PyUnicode_FromUnicode(s, size);
651 if (unicode == NULL)
652 return NULL;
653 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
654 Py_DECREF(unicode);
655 return v;
656}
657
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000658PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
659 const char *encoding,
660 const char *errors)
661{
662 PyObject *v;
663
664 if (!PyUnicode_Check(unicode)) {
665 PyErr_BadArgument();
666 goto onError;
667 }
668
669 if (encoding == NULL)
670 encoding = PyUnicode_GetDefaultEncoding();
671
672 /* Encode via the codec registry */
673 v = PyCodec_Encode(unicode, encoding, errors);
674 if (v == NULL)
675 goto onError;
676 return v;
677
678 onError:
679 return NULL;
680}
681
Guido van Rossumd57fd912000-03-10 22:53:23 +0000682PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
683 const char *encoding,
684 const char *errors)
685{
686 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000687
Guido van Rossumd57fd912000-03-10 22:53:23 +0000688 if (!PyUnicode_Check(unicode)) {
689 PyErr_BadArgument();
690 goto onError;
691 }
Fred Drakee4315f52000-05-09 19:53:39 +0000692
Tim Petersced69f82003-09-16 20:30:58 +0000693 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000694 encoding = PyUnicode_GetDefaultEncoding();
695
696 /* Shortcuts for common default encodings */
697 if (errors == NULL) {
698 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000699 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000700 else if (strcmp(encoding, "latin-1") == 0)
701 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000702#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
703 else if (strcmp(encoding, "mbcs") == 0)
704 return PyUnicode_AsMBCSString(unicode);
705#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000706 else if (strcmp(encoding, "ascii") == 0)
707 return PyUnicode_AsASCIIString(unicode);
708 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000709
710 /* Encode via the codec registry */
711 v = PyCodec_Encode(unicode, encoding, errors);
712 if (v == NULL)
713 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000714 if (!PyString_Check(v)) {
715 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000716 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717 v->ob_type->tp_name);
718 Py_DECREF(v);
719 goto onError;
720 }
721 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000722
Guido van Rossumd57fd912000-03-10 22:53:23 +0000723 onError:
724 return NULL;
725}
726
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000727PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
728 const char *errors)
729{
730 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
731
732 if (v)
733 return v;
734 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
735 if (v && errors == NULL)
736 ((PyUnicodeObject *)unicode)->defenc = v;
737 return v;
738}
739
Guido van Rossumd57fd912000-03-10 22:53:23 +0000740Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
741{
742 if (!PyUnicode_Check(unicode)) {
743 PyErr_BadArgument();
744 goto onError;
745 }
746 return PyUnicode_AS_UNICODE(unicode);
747
748 onError:
749 return NULL;
750}
751
Martin v. Löwis18e16552006-02-15 17:27:45 +0000752Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000753{
754 if (!PyUnicode_Check(unicode)) {
755 PyErr_BadArgument();
756 goto onError;
757 }
758 return PyUnicode_GET_SIZE(unicode);
759
760 onError:
761 return -1;
762}
763
Thomas Wouters78890102000-07-22 19:25:51 +0000764const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000765{
766 return unicode_default_encoding;
767}
768
769int PyUnicode_SetDefaultEncoding(const char *encoding)
770{
771 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000772
Fred Drakee4315f52000-05-09 19:53:39 +0000773 /* Make sure the encoding is valid. As side effect, this also
774 loads the encoding into the codec registry cache. */
775 v = _PyCodec_Lookup(encoding);
776 if (v == NULL)
777 goto onError;
778 Py_DECREF(v);
779 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000780 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000781 sizeof(unicode_default_encoding));
782 return 0;
783
784 onError:
785 return -1;
786}
787
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000788/* error handling callback helper:
789 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000790 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000791 and adjust various state variables.
792 return 0 on success, -1 on error
793*/
794
795static
796int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
797 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000798 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
799 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000800{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000801 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000802
803 PyObject *restuple = NULL;
804 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000805 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
806 Py_ssize_t requiredsize;
807 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000808 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000809 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000810 int res = -1;
811
812 if (*errorHandler == NULL) {
813 *errorHandler = PyCodec_LookupError(errors);
814 if (*errorHandler == NULL)
815 goto onError;
816 }
817
818 if (*exceptionObject == NULL) {
819 *exceptionObject = PyUnicodeDecodeError_Create(
820 encoding, input, insize, *startinpos, *endinpos, reason);
821 if (*exceptionObject == NULL)
822 goto onError;
823 }
824 else {
825 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
826 goto onError;
827 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
828 goto onError;
829 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
830 goto onError;
831 }
832
833 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
834 if (restuple == NULL)
835 goto onError;
836 if (!PyTuple_Check(restuple)) {
837 PyErr_Format(PyExc_TypeError, &argparse[4]);
838 goto onError;
839 }
840 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
841 goto onError;
842 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000843 newpos = insize+newpos;
844 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000845 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000846 goto onError;
847 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000848
849 /* need more space? (at least enough for what we
850 have+the replacement+the rest of the string (starting
851 at the new input position), so we won't have to check space
852 when there are no errors in the rest of the string) */
853 repptr = PyUnicode_AS_UNICODE(repunicode);
854 repsize = PyUnicode_GET_SIZE(repunicode);
855 requiredsize = *outpos + repsize + insize-newpos;
856 if (requiredsize > outsize) {
857 if (requiredsize<2*outsize)
858 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000859 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000860 goto onError;
861 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
862 }
863 *endinpos = newpos;
864 *inptr = input + newpos;
865 Py_UNICODE_COPY(*outptr, repptr, repsize);
866 *outptr += repsize;
867 *outpos += repsize;
868 /* we made it! */
869 res = 0;
870
871 onError:
872 Py_XDECREF(restuple);
873 return res;
874}
875
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000876/* --- UTF-7 Codec -------------------------------------------------------- */
877
878/* see RFC2152 for details */
879
/* Classification of the 128 ASCII code points for the UTF-7 codec.
   Each entry indicates whether the character is special, i.e. cannot
   be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)
   One row per 16 code points; the row's first code point is noted. */
static char utf7_special[128] = {
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1
};
898
/* Note: The comparison (c) <= 0 is a trick to work around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True when `c` must be base64-encoded: it is outside 1..127, it is
   marked 1 in utf7_special, or it belongs to an optional class (2 =
   whitespace, 3 = Set O) that the caller asked to have encoded. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Base64 digit for the low six bits of `n`. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True when `c` is one of the 64 base64 digits.  Written with explicit
   ASCII ranges rather than isalnum(): `c` may be a Py_UNICODE value far
   outside the range isalnum() accepts, and isalnum() depends on the
   current locale.  UB64() below already relies on ASCII ordering. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || (c) == '+' || (c) == '/')

/* Value (0..63) of the base64 digit `c`; only meaningful when
   B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits through `out` for as long as at least six bits are
   pending in `ch`; `bits` is decremented accordingly.  Wrapped in
   do/while(0) so the macro behaves as one statement (e.g. in if/else). */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits)-6)); \
            (bits) -= 6; \
        } \
    } while (0)

/* Emit 16-bit units through `out` for as long as at least sixteen bits
   are pending in `ch`.  A unit in 0xDC00..0xDFFF sets `surrogate`, stores
   a message in the caller's `errmsg` and jumps to the caller's
   `utf7Error` label; the next unit after that is dropped silently. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                /* We have already generated an error for the high surrogate */ \
                /* so let's not bother seeing if the low surrogate is correct or not */ \
                (surrogate) = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                /* This is a surrogate pair. Unfortunately we can't represent */ \
                /* it in a 16-bit character */ \
                (surrogate) = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000941
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000942PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000943 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000944 const char *errors)
945{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000946 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000947 Py_ssize_t startinpos;
948 Py_ssize_t endinpos;
949 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000950 const char *e;
951 PyUnicodeObject *unicode;
952 Py_UNICODE *p;
953 const char *errmsg = "";
954 int inShift = 0;
955 unsigned int bitsleft = 0;
956 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000957 int surrogate = 0;
958 PyObject *errorHandler = NULL;
959 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000960
961 unicode = _PyUnicode_New(size);
962 if (!unicode)
963 return NULL;
964 if (size == 0)
965 return (PyObject *)unicode;
966
967 p = unicode->str;
968 e = s + size;
969
970 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000971 Py_UNICODE ch;
972 restart:
973 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000974
975 if (inShift) {
976 if ((ch == '-') || !B64CHAR(ch)) {
977 inShift = 0;
978 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000979
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000980 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
981 if (bitsleft >= 6) {
982 /* The shift sequence has a partial character in it. If
983 bitsleft < 6 then we could just classify it as padding
984 but that is not the case here */
985
986 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000987 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000988 }
989 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000990 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000991 here so indicate the potential of a misencoded character. */
992
993 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
994 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
995 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000996 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000997 }
998
999 if (ch == '-') {
1000 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001001 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001002 inShift = 1;
1003 }
1004 } else if (SPECIAL(ch,0,0)) {
1005 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001006 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001007 } else {
1008 *p++ = ch;
1009 }
1010 } else {
1011 charsleft = (charsleft << 6) | UB64(ch);
1012 bitsleft += 6;
1013 s++;
1014 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1015 }
1016 }
1017 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001018 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001019 s++;
1020 if (s < e && *s == '-') {
1021 s++;
1022 *p++ = '+';
1023 } else
1024 {
1025 inShift = 1;
1026 bitsleft = 0;
1027 }
1028 }
1029 else if (SPECIAL(ch,0,0)) {
1030 errmsg = "unexpected special character";
1031 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001032 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001033 }
1034 else {
1035 *p++ = ch;
1036 s++;
1037 }
1038 continue;
1039 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001040 outpos = p-PyUnicode_AS_UNICODE(unicode);
1041 endinpos = s-starts;
1042 if (unicode_decode_call_errorhandler(
1043 errors, &errorHandler,
1044 "utf7", errmsg,
1045 starts, size, &startinpos, &endinpos, &exc, &s,
1046 (PyObject **)&unicode, &outpos, &p))
1047 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001048 }
1049
1050 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001051 outpos = p-PyUnicode_AS_UNICODE(unicode);
1052 endinpos = size;
1053 if (unicode_decode_call_errorhandler(
1054 errors, &errorHandler,
1055 "utf7", "unterminated shift sequence",
1056 starts, size, &startinpos, &endinpos, &exc, &s,
1057 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001058 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001059 if (s < e)
1060 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001061 }
1062
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001063 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001064 goto onError;
1065
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001066 Py_XDECREF(errorHandler);
1067 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001068 return (PyObject *)unicode;
1069
1070onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001071 Py_XDECREF(errorHandler);
1072 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001073 Py_DECREF(unicode);
1074 return NULL;
1075}
1076
1077
1078PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001079 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001080 int encodeSetO,
1081 int encodeWhiteSpace,
1082 const char *errors)
1083{
1084 PyObject *v;
1085 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001086 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001087 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001088 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001089 unsigned int bitsleft = 0;
1090 unsigned long charsleft = 0;
1091 char * out;
1092 char * start;
1093
1094 if (size == 0)
1095 return PyString_FromStringAndSize(NULL, 0);
1096
1097 v = PyString_FromStringAndSize(NULL, cbAllocated);
1098 if (v == NULL)
1099 return NULL;
1100
1101 start = out = PyString_AS_STRING(v);
1102 for (;i < size; ++i) {
1103 Py_UNICODE ch = s[i];
1104
1105 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001106 if (ch == '+') {
1107 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001108 *out++ = '-';
1109 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1110 charsleft = ch;
1111 bitsleft = 16;
1112 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001113 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001114 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001115 } else {
1116 *out++ = (char) ch;
1117 }
1118 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001119 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1120 *out++ = B64(charsleft << (6-bitsleft));
1121 charsleft = 0;
1122 bitsleft = 0;
1123 /* Characters not in the BASE64 set implicitly unshift the sequence
1124 so no '-' is required, except if the character is itself a '-' */
1125 if (B64CHAR(ch) || ch == '-') {
1126 *out++ = '-';
1127 }
1128 inShift = 0;
1129 *out++ = (char) ch;
1130 } else {
1131 bitsleft += 16;
1132 charsleft = (charsleft << 16) | ch;
1133 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1134
1135 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001136 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001137 or '-' then the shift sequence will be terminated implicitly and we
1138 don't have to insert a '-'. */
1139
1140 if (bitsleft == 0) {
1141 if (i + 1 < size) {
1142 Py_UNICODE ch2 = s[i+1];
1143
1144 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001145
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001146 } else if (B64CHAR(ch2) || ch2 == '-') {
1147 *out++ = '-';
1148 inShift = 0;
1149 } else {
1150 inShift = 0;
1151 }
1152
1153 }
1154 else {
1155 *out++ = '-';
1156 inShift = 0;
1157 }
1158 }
Tim Petersced69f82003-09-16 20:30:58 +00001159 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001160 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001161 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001162 if (bitsleft) {
1163 *out++= B64(charsleft << (6-bitsleft) );
1164 *out++ = '-';
1165 }
1166
Tim Peters5de98422002-04-27 18:44:32 +00001167 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001168 return v;
1169}
1170
1171#undef SPECIAL
1172#undef B64
1173#undef B64CHAR
1174#undef UB64
1175#undef ENCODE
1176#undef DECODE
1177
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178/* --- UTF-8 Codec -------------------------------------------------------- */
1179
/* Map UTF-8 encoded prefix byte to sequence length. zero means
   illegal prefix. see RFC 2279 for details.
   One row per 16 byte values; the row's first byte value is noted. */
static char utf8_code_length[256] = {
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1201
Guido van Rossumd57fd912000-03-10 22:53:23 +00001202PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001203 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001204 const char *errors)
1205{
Walter Dörwald69652032004-09-07 20:24:22 +00001206 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1207}
1208
1209PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001210 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001211 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001212 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001213{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001214 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001215 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001216 Py_ssize_t startinpos;
1217 Py_ssize_t endinpos;
1218 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219 const char *e;
1220 PyUnicodeObject *unicode;
1221 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001222 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001223 PyObject *errorHandler = NULL;
1224 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225
1226 /* Note: size will always be longer than the resulting Unicode
1227 character count */
1228 unicode = _PyUnicode_New(size);
1229 if (!unicode)
1230 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001231 if (size == 0) {
1232 if (consumed)
1233 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001234 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236
1237 /* Unpack UTF-8 encoded data */
1238 p = unicode->str;
1239 e = s + size;
1240
1241 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001242 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243
1244 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001245 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001246 s++;
1247 continue;
1248 }
1249
1250 n = utf8_code_length[ch];
1251
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001252 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001253 if (consumed)
1254 break;
1255 else {
1256 errmsg = "unexpected end of data";
1257 startinpos = s-starts;
1258 endinpos = size;
1259 goto utf8Error;
1260 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001261 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001262
1263 switch (n) {
1264
1265 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001266 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001267 startinpos = s-starts;
1268 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001269 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001270
1271 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001272 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001273 startinpos = s-starts;
1274 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001275 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276
1277 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001278 if ((s[1] & 0xc0) != 0x80) {
1279 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001280 startinpos = s-starts;
1281 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001282 goto utf8Error;
1283 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001284 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001285 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001286 startinpos = s-starts;
1287 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001288 errmsg = "illegal encoding";
1289 goto utf8Error;
1290 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001292 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 break;
1294
1295 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001296 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001297 (s[2] & 0xc0) != 0x80) {
1298 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001299 startinpos = s-starts;
1300 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001301 goto utf8Error;
1302 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001303 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001304 if (ch < 0x0800) {
1305 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001306 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001307
1308 XXX For wide builds (UCS-4) we should probably try
1309 to recombine the surrogates into a single code
1310 unit.
1311 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001312 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001313 startinpos = s-starts;
1314 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001315 goto utf8Error;
1316 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001317 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001318 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001319 break;
1320
1321 case 4:
1322 if ((s[1] & 0xc0) != 0x80 ||
1323 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001324 (s[3] & 0xc0) != 0x80) {
1325 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001326 startinpos = s-starts;
1327 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001328 goto utf8Error;
1329 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001330 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1331 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1332 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001333 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001334 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001335 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001336 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001337 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001338 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001339 startinpos = s-starts;
1340 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001341 goto utf8Error;
1342 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001343#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001344 *p++ = (Py_UNICODE)ch;
1345#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001346 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001347
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001348 /* translate from 10000..10FFFF to 0..FFFF */
1349 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001350
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001351 /* high surrogate = top 10 bits added to D800 */
1352 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001353
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001354 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001355 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001356#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001357 break;
1358
1359 default:
1360 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001361 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001362 startinpos = s-starts;
1363 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001364 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365 }
1366 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001367 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001368
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001369 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001370 outpos = p-PyUnicode_AS_UNICODE(unicode);
1371 if (unicode_decode_call_errorhandler(
1372 errors, &errorHandler,
1373 "utf8", errmsg,
1374 starts, size, &startinpos, &endinpos, &exc, &s,
1375 (PyObject **)&unicode, &outpos, &p))
1376 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001377 }
Walter Dörwald69652032004-09-07 20:24:22 +00001378 if (consumed)
1379 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001380
1381 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001382 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001383 goto onError;
1384
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001385 Py_XDECREF(errorHandler);
1386 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 return (PyObject *)unicode;
1388
1389onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001390 Py_XDECREF(errorHandler);
1391 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 Py_DECREF(unicode);
1393 return NULL;
1394}
1395
Tim Peters602f7402002-04-27 18:03:26 +00001396/* Allocation strategy: if the string is short, convert into a stack buffer
1397 and allocate exactly as much space needed at the end. Else allocate the
1398 maximum possible needed (4 result bytes per Unicode character), and return
1399 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001400*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001401PyObject *
1402PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001403 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001404 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001405{
Tim Peters602f7402002-04-27 18:03:26 +00001406#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001407
Martin v. Löwis18e16552006-02-15 17:27:45 +00001408 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001409 PyObject *v; /* result string object */
1410 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001411 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001412 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001413 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001414
Tim Peters602f7402002-04-27 18:03:26 +00001415 assert(s != NULL);
1416 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001417
Tim Peters602f7402002-04-27 18:03:26 +00001418 if (size <= MAX_SHORT_UNICHARS) {
1419 /* Write into the stack buffer; nallocated can't overflow.
1420 * At the end, we'll allocate exactly as much heap space as it
1421 * turns out we need.
1422 */
1423 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1424 v = NULL; /* will allocate after we're done */
1425 p = stackbuf;
1426 }
1427 else {
1428 /* Overallocate on the heap, and give the excess back at the end. */
1429 nallocated = size * 4;
1430 if (nallocated / 4 != size) /* overflow! */
1431 return PyErr_NoMemory();
1432 v = PyString_FromStringAndSize(NULL, nallocated);
1433 if (v == NULL)
1434 return NULL;
1435 p = PyString_AS_STRING(v);
1436 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001437
Tim Peters602f7402002-04-27 18:03:26 +00001438 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001439 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001440
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001441 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001442 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001443 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001444
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001446 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001447 *p++ = (char)(0xc0 | (ch >> 6));
1448 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001449 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001450 else {
Tim Peters602f7402002-04-27 18:03:26 +00001451 /* Encode UCS2 Unicode ordinals */
1452 if (ch < 0x10000) {
1453 /* Special case: check for high surrogate */
1454 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1455 Py_UCS4 ch2 = s[i];
1456 /* Check for low surrogate and combine the two to
1457 form a UCS4 value */
1458 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001459 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001460 i++;
1461 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001462 }
Tim Peters602f7402002-04-27 18:03:26 +00001463 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001464 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001465 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001466 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1467 *p++ = (char)(0x80 | (ch & 0x3f));
1468 continue;
1469 }
1470encodeUCS4:
1471 /* Encode UCS4 Unicode ordinals */
1472 *p++ = (char)(0xf0 | (ch >> 18));
1473 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1474 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1475 *p++ = (char)(0x80 | (ch & 0x3f));
1476 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001477 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001478
Tim Peters602f7402002-04-27 18:03:26 +00001479 if (v == NULL) {
1480 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001481 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001482 assert(nneeded <= nallocated);
1483 v = PyString_FromStringAndSize(stackbuf, nneeded);
1484 }
1485 else {
1486 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001487 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001488 assert(nneeded <= nallocated);
1489 _PyString_Resize(&v, nneeded);
1490 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001491 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001492
Tim Peters602f7402002-04-27 18:03:26 +00001493#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001494}
1495
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1497{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001498 if (!PyUnicode_Check(unicode)) {
1499 PyErr_BadArgument();
1500 return NULL;
1501 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001502 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1503 PyUnicode_GET_SIZE(unicode),
1504 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001505}
1506
1507/* --- UTF-16 Codec ------------------------------------------------------- */
1508
Tim Peters772747b2001-08-09 22:21:55 +00001509PyObject *
1510PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001511 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001512 const char *errors,
1513 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001514{
Walter Dörwald69652032004-09-07 20:24:22 +00001515 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1516}
1517
1518PyObject *
1519PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001520 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001521 const char *errors,
1522 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001523 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001524{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001525 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001526 Py_ssize_t startinpos;
1527 Py_ssize_t endinpos;
1528 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001529 PyUnicodeObject *unicode;
1530 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001531 const unsigned char *q, *e;
1532 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001533 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001534 /* Offsets from q for retrieving byte pairs in the right order. */
1535#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1536 int ihi = 1, ilo = 0;
1537#else
1538 int ihi = 0, ilo = 1;
1539#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001540 PyObject *errorHandler = NULL;
1541 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542
1543 /* Note: size will always be longer than the resulting Unicode
1544 character count */
1545 unicode = _PyUnicode_New(size);
1546 if (!unicode)
1547 return NULL;
1548 if (size == 0)
1549 return (PyObject *)unicode;
1550
1551 /* Unpack UTF-16 encoded data */
1552 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001553 q = (unsigned char *)s;
1554 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555
1556 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001557 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001558
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001559 /* Check for BOM marks (U+FEFF) in the input and adjust current
1560 byte order setting accordingly. In native mode, the leading BOM
1561 mark is skipped, in all other modes, it is copied to the output
1562 stream as-is (giving a ZWNBSP character). */
1563 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001564 if (size >= 2) {
1565 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001566#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001567 if (bom == 0xFEFF) {
1568 q += 2;
1569 bo = -1;
1570 }
1571 else if (bom == 0xFFFE) {
1572 q += 2;
1573 bo = 1;
1574 }
Tim Petersced69f82003-09-16 20:30:58 +00001575#else
Walter Dörwald69652032004-09-07 20:24:22 +00001576 if (bom == 0xFEFF) {
1577 q += 2;
1578 bo = 1;
1579 }
1580 else if (bom == 0xFFFE) {
1581 q += 2;
1582 bo = -1;
1583 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001584#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001585 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001587
Tim Peters772747b2001-08-09 22:21:55 +00001588 if (bo == -1) {
1589 /* force LE */
1590 ihi = 1;
1591 ilo = 0;
1592 }
1593 else if (bo == 1) {
1594 /* force BE */
1595 ihi = 0;
1596 ilo = 1;
1597 }
1598
1599 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001600 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001601 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001602 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001603 if (consumed)
1604 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001605 errmsg = "truncated data";
1606 startinpos = ((const char *)q)-starts;
1607 endinpos = ((const char *)e)-starts;
1608 goto utf16Error;
1609 /* The remaining input chars are ignored if the callback
1610 chooses to skip the input */
1611 }
1612 ch = (q[ihi] << 8) | q[ilo];
1613
Tim Peters772747b2001-08-09 22:21:55 +00001614 q += 2;
1615
Guido van Rossumd57fd912000-03-10 22:53:23 +00001616 if (ch < 0xD800 || ch > 0xDFFF) {
1617 *p++ = ch;
1618 continue;
1619 }
1620
1621 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001622 if (q >= e) {
1623 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001624 startinpos = (((const char *)q)-2)-starts;
1625 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001626 goto utf16Error;
1627 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001628 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001629 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1630 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001631 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001632#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001633 *p++ = ch;
1634 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001635#else
1636 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001637#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001638 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001639 }
1640 else {
1641 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001642 startinpos = (((const char *)q)-4)-starts;
1643 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001644 goto utf16Error;
1645 }
1646
Guido van Rossumd57fd912000-03-10 22:53:23 +00001647 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001648 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001649 startinpos = (((const char *)q)-2)-starts;
1650 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001651 /* Fall through to report the error */
1652
1653 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001654 outpos = p-PyUnicode_AS_UNICODE(unicode);
1655 if (unicode_decode_call_errorhandler(
1656 errors, &errorHandler,
1657 "utf16", errmsg,
1658 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1659 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001660 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001661 }
1662
1663 if (byteorder)
1664 *byteorder = bo;
1665
Walter Dörwald69652032004-09-07 20:24:22 +00001666 if (consumed)
1667 *consumed = (const char *)q-starts;
1668
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001670 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001671 goto onError;
1672
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001673 Py_XDECREF(errorHandler);
1674 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001675 return (PyObject *)unicode;
1676
1677onError:
1678 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001679 Py_XDECREF(errorHandler);
1680 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001681 return NULL;
1682}
1683
Tim Peters772747b2001-08-09 22:21:55 +00001684PyObject *
1685PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001686 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001687 const char *errors,
1688 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001689{
1690 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001691 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001692#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001693 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001694#else
1695 const int pairs = 0;
1696#endif
Tim Peters772747b2001-08-09 22:21:55 +00001697 /* Offsets from p for storing byte pairs in the right order. */
1698#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1699 int ihi = 1, ilo = 0;
1700#else
1701 int ihi = 0, ilo = 1;
1702#endif
1703
1704#define STORECHAR(CH) \
1705 do { \
1706 p[ihi] = ((CH) >> 8) & 0xff; \
1707 p[ilo] = (CH) & 0xff; \
1708 p += 2; \
1709 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001711#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001712 for (i = pairs = 0; i < size; i++)
1713 if (s[i] >= 0x10000)
1714 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001715#endif
Tim Petersced69f82003-09-16 20:30:58 +00001716 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001717 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718 if (v == NULL)
1719 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001720
Tim Peters772747b2001-08-09 22:21:55 +00001721 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001722 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001723 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001724 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001725 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001726
1727 if (byteorder == -1) {
1728 /* force LE */
1729 ihi = 1;
1730 ilo = 0;
1731 }
1732 else if (byteorder == 1) {
1733 /* force BE */
1734 ihi = 0;
1735 ilo = 1;
1736 }
1737
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001738 while (size-- > 0) {
1739 Py_UNICODE ch = *s++;
1740 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001741#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001742 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001743 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1744 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001746#endif
Tim Peters772747b2001-08-09 22:21:55 +00001747 STORECHAR(ch);
1748 if (ch2)
1749 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001750 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001752#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753}
1754
1755PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1756{
1757 if (!PyUnicode_Check(unicode)) {
1758 PyErr_BadArgument();
1759 return NULL;
1760 }
1761 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1762 PyUnicode_GET_SIZE(unicode),
1763 NULL,
1764 0);
1765}
1766
1767/* --- Unicode Escape Codec ----------------------------------------------- */
1768
Fredrik Lundh06d12682001-01-24 07:59:11 +00001769static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001770
Guido van Rossumd57fd912000-03-10 22:53:23 +00001771PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001772 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001773 const char *errors)
1774{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001775 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001776 Py_ssize_t startinpos;
1777 Py_ssize_t endinpos;
1778 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001779 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001781 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001783 char* message;
1784 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001785 PyObject *errorHandler = NULL;
1786 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001787
Guido van Rossumd57fd912000-03-10 22:53:23 +00001788 /* Escaped strings will always be longer than the resulting
1789 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001790 length after conversion to the true value.
1791 (but if the error callback returns a long replacement string
1792 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 v = _PyUnicode_New(size);
1794 if (v == NULL)
1795 goto onError;
1796 if (size == 0)
1797 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001798
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001799 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001800 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001801
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802 while (s < end) {
1803 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001804 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001805 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001806
1807 /* Non-escape characters are interpreted as Unicode ordinals */
1808 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001809 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810 continue;
1811 }
1812
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001813 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001814 /* \ - Escapes */
1815 s++;
1816 switch (*s++) {
1817
1818 /* \x escapes */
1819 case '\n': break;
1820 case '\\': *p++ = '\\'; break;
1821 case '\'': *p++ = '\''; break;
1822 case '\"': *p++ = '\"'; break;
1823 case 'b': *p++ = '\b'; break;
1824 case 'f': *p++ = '\014'; break; /* FF */
1825 case 't': *p++ = '\t'; break;
1826 case 'n': *p++ = '\n'; break;
1827 case 'r': *p++ = '\r'; break;
1828 case 'v': *p++ = '\013'; break; /* VT */
1829 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1830
1831 /* \OOO (octal) escapes */
1832 case '0': case '1': case '2': case '3':
1833 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001834 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001836 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001837 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001838 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001839 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001840 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001841 break;
1842
Fredrik Lundhccc74732001-02-18 22:13:49 +00001843 /* hex escapes */
1844 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001845 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001846 digits = 2;
1847 message = "truncated \\xXX escape";
1848 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849
Fredrik Lundhccc74732001-02-18 22:13:49 +00001850 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001851 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001852 digits = 4;
1853 message = "truncated \\uXXXX escape";
1854 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001855
Fredrik Lundhccc74732001-02-18 22:13:49 +00001856 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001857 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001858 digits = 8;
1859 message = "truncated \\UXXXXXXXX escape";
1860 hexescape:
1861 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001862 outpos = p-PyUnicode_AS_UNICODE(v);
1863 if (s+digits>end) {
1864 endinpos = size;
1865 if (unicode_decode_call_errorhandler(
1866 errors, &errorHandler,
1867 "unicodeescape", "end of string in escape sequence",
1868 starts, size, &startinpos, &endinpos, &exc, &s,
1869 (PyObject **)&v, &outpos, &p))
1870 goto onError;
1871 goto nextByte;
1872 }
1873 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001874 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001875 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001876 endinpos = (s+i+1)-starts;
1877 if (unicode_decode_call_errorhandler(
1878 errors, &errorHandler,
1879 "unicodeescape", message,
1880 starts, size, &startinpos, &endinpos, &exc, &s,
1881 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001882 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001883 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001884 }
1885 chr = (chr<<4) & ~0xF;
1886 if (c >= '0' && c <= '9')
1887 chr += c - '0';
1888 else if (c >= 'a' && c <= 'f')
1889 chr += 10 + c - 'a';
1890 else
1891 chr += 10 + c - 'A';
1892 }
1893 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001894 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001895 /* _decoding_error will have already written into the
1896 target buffer. */
1897 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001898 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001899 /* when we get here, chr is a 32-bit unicode character */
1900 if (chr <= 0xffff)
1901 /* UCS-2 character */
1902 *p++ = (Py_UNICODE) chr;
1903 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001904 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001905 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001906#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001907 *p++ = chr;
1908#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001909 chr -= 0x10000L;
1910 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001911 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001912#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001913 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001914 endinpos = s-starts;
1915 outpos = p-PyUnicode_AS_UNICODE(v);
1916 if (unicode_decode_call_errorhandler(
1917 errors, &errorHandler,
1918 "unicodeescape", "illegal Unicode character",
1919 starts, size, &startinpos, &endinpos, &exc, &s,
1920 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001921 goto onError;
1922 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001923 break;
1924
1925 /* \N{name} */
1926 case 'N':
1927 message = "malformed \\N character escape";
1928 if (ucnhash_CAPI == NULL) {
1929 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001930 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001931 m = PyImport_ImportModule("unicodedata");
1932 if (m == NULL)
1933 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001934 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001935 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001936 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001937 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00001938 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001939 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001940 if (ucnhash_CAPI == NULL)
1941 goto ucnhashError;
1942 }
1943 if (*s == '{') {
1944 const char *start = s+1;
1945 /* look for the closing brace */
1946 while (*s != '}' && s < end)
1947 s++;
1948 if (s > start && s < end && *s == '}') {
1949 /* found a name. look it up in the unicode database */
1950 message = "unknown Unicode character name";
1951 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001952 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001953 goto store;
1954 }
1955 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001956 endinpos = s-starts;
1957 outpos = p-PyUnicode_AS_UNICODE(v);
1958 if (unicode_decode_call_errorhandler(
1959 errors, &errorHandler,
1960 "unicodeescape", message,
1961 starts, size, &startinpos, &endinpos, &exc, &s,
1962 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001963 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001964 break;
1965
1966 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001967 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001968 message = "\\ at end of string";
1969 s--;
1970 endinpos = s-starts;
1971 outpos = p-PyUnicode_AS_UNICODE(v);
1972 if (unicode_decode_call_errorhandler(
1973 errors, &errorHandler,
1974 "unicodeescape", message,
1975 starts, size, &startinpos, &endinpos, &exc, &s,
1976 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001977 goto onError;
1978 }
1979 else {
1980 *p++ = '\\';
1981 *p++ = (unsigned char)s[-1];
1982 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001983 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001985 nextByte:
1986 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00001988 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001989 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001990 Py_XDECREF(errorHandler);
1991 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001993
Fredrik Lundhccc74732001-02-18 22:13:49 +00001994ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001995 PyErr_SetString(
1996 PyExc_UnicodeError,
1997 "\\N escapes not supported (can't load unicodedata module)"
1998 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001999 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002000 Py_XDECREF(errorHandler);
2001 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002002 return NULL;
2003
Fredrik Lundhccc74732001-02-18 22:13:49 +00002004onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002005 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002006 Py_XDECREF(errorHandler);
2007 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008 return NULL;
2009}
2010
2011/* Return a Unicode-Escape string version of the Unicode object.
2012
2013 If quotes is true, the string is enclosed in u"" or u'' quotes as
2014 appropriate.
2015
2016*/
2017
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002018Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Fredrik Lundh95e2a912006-05-26 11:38:15 +00002019 Py_ssize_t size,
2020 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002021{
2022 /* like wcschr, but doesn't stop at NULL characters */
2023
2024 while (size-- > 0) {
2025 if (*s == ch)
2026 return s;
2027 s++;
2028 }
2029
2030 return NULL;
2031}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002032
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033static
2034PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002035 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036 int quotes)
2037{
2038 PyObject *repr;
2039 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002041 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042
2043 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
2044 if (repr == NULL)
2045 return NULL;
2046
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002047 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048
2049 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002051 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002052 !findchar(s, size, '"')) ? '"' : '\'';
2053 }
2054 while (size-- > 0) {
2055 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002056
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002057 /* Escape quotes and backslashes */
2058 if ((quotes &&
2059 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002060 *p++ = '\\';
2061 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002062 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002063 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002064
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002065#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002066 /* Map 21-bit characters to '\U00xxxxxx' */
2067 else if (ch >= 0x10000) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00002068 Py_ssize_t offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00002069
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002070 /* Resize the string if necessary */
2071 if (offset + 12 > PyString_GET_SIZE(repr)) {
2072 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00002073 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002074 p = PyString_AS_STRING(repr) + offset;
2075 }
2076
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002077 *p++ = '\\';
2078 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002079 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2080 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2081 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2082 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2083 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2084 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2085 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002086 *p++ = hexdigit[ch & 0x0000000F];
2087 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002088 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002089#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002090 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2091 else if (ch >= 0xD800 && ch < 0xDC00) {
2092 Py_UNICODE ch2;
2093 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002094
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002095 ch2 = *s++;
2096 size--;
2097 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2098 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2099 *p++ = '\\';
2100 *p++ = 'U';
2101 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2102 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2103 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2104 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2105 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2106 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2107 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2108 *p++ = hexdigit[ucs & 0x0000000F];
2109 continue;
2110 }
2111 /* Fall through: isolated surrogates are copied as-is */
2112 s--;
2113 size++;
2114 }
2115
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002117 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002118 *p++ = '\\';
2119 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002120 *p++ = hexdigit[(ch >> 12) & 0x000F];
2121 *p++ = hexdigit[(ch >> 8) & 0x000F];
2122 *p++ = hexdigit[(ch >> 4) & 0x000F];
2123 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002124 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002125
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002126 /* Map special whitespace to '\t', \n', '\r' */
2127 else if (ch == '\t') {
2128 *p++ = '\\';
2129 *p++ = 't';
2130 }
2131 else if (ch == '\n') {
2132 *p++ = '\\';
2133 *p++ = 'n';
2134 }
2135 else if (ch == '\r') {
2136 *p++ = '\\';
2137 *p++ = 'r';
2138 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002139
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002140 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002141 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002142 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002143 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002144 *p++ = hexdigit[(ch >> 4) & 0x000F];
2145 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002146 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002147
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148 /* Copy everything else as-is */
2149 else
2150 *p++ = (char) ch;
2151 }
2152 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002153 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002154
2155 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002156 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002157 return repr;
2158}
2159
2160PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002161 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002162{
2163 return unicodeescape_string(s, size, 0);
2164}
2165
2166PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2167{
2168 if (!PyUnicode_Check(unicode)) {
2169 PyErr_BadArgument();
2170 return NULL;
2171 }
2172 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2173 PyUnicode_GET_SIZE(unicode));
2174}
2175
2176/* --- Raw Unicode Escape Codec ------------------------------------------- */
2177
2178PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002179 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002180 const char *errors)
2181{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002182 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002183 Py_ssize_t startinpos;
2184 Py_ssize_t endinpos;
2185 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002186 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002187 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002188 const char *end;
2189 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002190 PyObject *errorHandler = NULL;
2191 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002192
Guido van Rossumd57fd912000-03-10 22:53:23 +00002193 /* Escaped strings will always be longer than the resulting
2194 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002195 length after conversion to the true value. (But decoding error
2196 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002197 v = _PyUnicode_New(size);
2198 if (v == NULL)
2199 goto onError;
2200 if (size == 0)
2201 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002202 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002203 end = s + size;
2204 while (s < end) {
2205 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002206 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002207 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002208 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002209
2210 /* Non-escape characters are interpreted as Unicode ordinals */
2211 if (*s != '\\') {
2212 *p++ = (unsigned char)*s++;
2213 continue;
2214 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002215 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002216
2217 /* \u-escapes are only interpreted iff the number of leading
2218 backslashes if odd */
2219 bs = s;
2220 for (;s < end;) {
2221 if (*s != '\\')
2222 break;
2223 *p++ = (unsigned char)*s++;
2224 }
2225 if (((s - bs) & 1) == 0 ||
2226 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002227 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228 continue;
2229 }
2230 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002231 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232 s++;
2233
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002234 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002235 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002236 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002237 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002238 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002239 endinpos = s-starts;
2240 if (unicode_decode_call_errorhandler(
2241 errors, &errorHandler,
2242 "rawunicodeescape", "truncated \\uXXXX",
2243 starts, size, &startinpos, &endinpos, &exc, &s,
2244 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002245 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002246 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002247 }
2248 x = (x<<4) & ~0xF;
2249 if (c >= '0' && c <= '9')
2250 x += c - '0';
2251 else if (c >= 'a' && c <= 'f')
2252 x += 10 + c - 'a';
2253 else
2254 x += 10 + c - 'A';
2255 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002256#ifndef Py_UNICODE_WIDE
2257 if (x > 0x10000) {
2258 if (unicode_decode_call_errorhandler(
2259 errors, &errorHandler,
2260 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2261 starts, size, &startinpos, &endinpos, &exc, &s,
2262 (PyObject **)&v, &outpos, &p))
2263 goto onError;
2264 }
2265#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002266 *p++ = x;
2267 nextByte:
2268 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002269 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002270 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002271 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002272 Py_XDECREF(errorHandler);
2273 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002274 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002275
Guido van Rossumd57fd912000-03-10 22:53:23 +00002276 onError:
2277 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002278 Py_XDECREF(errorHandler);
2279 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002280 return NULL;
2281}
2282
2283PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002284 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002285{
2286 PyObject *repr;
2287 char *p;
2288 char *q;
2289
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002290 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002292#ifdef Py_UNICODE_WIDE
2293 repr = PyString_FromStringAndSize(NULL, 10 * size);
2294#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002295 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002296#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002297 if (repr == NULL)
2298 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002299 if (size == 0)
2300 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002301
2302 p = q = PyString_AS_STRING(repr);
2303 while (size-- > 0) {
2304 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002305#ifdef Py_UNICODE_WIDE
2306 /* Map 32-bit characters to '\Uxxxxxxxx' */
2307 if (ch >= 0x10000) {
2308 *p++ = '\\';
2309 *p++ = 'U';
2310 *p++ = hexdigit[(ch >> 28) & 0xf];
2311 *p++ = hexdigit[(ch >> 24) & 0xf];
2312 *p++ = hexdigit[(ch >> 20) & 0xf];
2313 *p++ = hexdigit[(ch >> 16) & 0xf];
2314 *p++ = hexdigit[(ch >> 12) & 0xf];
2315 *p++ = hexdigit[(ch >> 8) & 0xf];
2316 *p++ = hexdigit[(ch >> 4) & 0xf];
2317 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002318 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002319 else
2320#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002321 /* Map 16-bit characters to '\uxxxx' */
2322 if (ch >= 256) {
2323 *p++ = '\\';
2324 *p++ = 'u';
2325 *p++ = hexdigit[(ch >> 12) & 0xf];
2326 *p++ = hexdigit[(ch >> 8) & 0xf];
2327 *p++ = hexdigit[(ch >> 4) & 0xf];
2328 *p++ = hexdigit[ch & 15];
2329 }
2330 /* Copy everything else as-is */
2331 else
2332 *p++ = (char) ch;
2333 }
2334 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002335 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002336 return repr;
2337}
2338
2339PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2340{
2341 if (!PyUnicode_Check(unicode)) {
2342 PyErr_BadArgument();
2343 return NULL;
2344 }
2345 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2346 PyUnicode_GET_SIZE(unicode));
2347}
2348
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002349/* --- Unicode Internal Codec ------------------------------------------- */
2350
2351PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002352 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002353 const char *errors)
2354{
2355 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002356 Py_ssize_t startinpos;
2357 Py_ssize_t endinpos;
2358 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002359 PyUnicodeObject *v;
2360 Py_UNICODE *p;
2361 const char *end;
2362 const char *reason;
2363 PyObject *errorHandler = NULL;
2364 PyObject *exc = NULL;
2365
Neal Norwitzd43069c2006-01-08 01:12:10 +00002366#ifdef Py_UNICODE_WIDE
2367 Py_UNICODE unimax = PyUnicode_GetMax();
2368#endif
2369
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002370 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2371 if (v == NULL)
2372 goto onError;
2373 if (PyUnicode_GetSize((PyObject *)v) == 0)
2374 return (PyObject *)v;
2375 p = PyUnicode_AS_UNICODE(v);
2376 end = s + size;
2377
2378 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00002379 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002380 /* We have to sanity check the raw data, otherwise doom looms for
2381 some malformed UCS-4 data. */
2382 if (
2383 #ifdef Py_UNICODE_WIDE
2384 *p > unimax || *p < 0 ||
2385 #endif
2386 end-s < Py_UNICODE_SIZE
2387 )
2388 {
2389 startinpos = s - starts;
2390 if (end-s < Py_UNICODE_SIZE) {
2391 endinpos = end-starts;
2392 reason = "truncated input";
2393 }
2394 else {
2395 endinpos = s - starts + Py_UNICODE_SIZE;
2396 reason = "illegal code point (> 0x10FFFF)";
2397 }
2398 outpos = p - PyUnicode_AS_UNICODE(v);
2399 if (unicode_decode_call_errorhandler(
2400 errors, &errorHandler,
2401 "unicode_internal", reason,
2402 starts, size, &startinpos, &endinpos, &exc, &s,
2403 (PyObject **)&v, &outpos, &p)) {
2404 goto onError;
2405 }
2406 }
2407 else {
2408 p++;
2409 s += Py_UNICODE_SIZE;
2410 }
2411 }
2412
Martin v. Löwis412fb672006-04-13 06:34:32 +00002413 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002414 goto onError;
2415 Py_XDECREF(errorHandler);
2416 Py_XDECREF(exc);
2417 return (PyObject *)v;
2418
2419 onError:
2420 Py_XDECREF(v);
2421 Py_XDECREF(errorHandler);
2422 Py_XDECREF(exc);
2423 return NULL;
2424}
2425
Guido van Rossumd57fd912000-03-10 22:53:23 +00002426/* --- Latin-1 Codec ------------------------------------------------------ */
2427
2428PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002429 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002430 const char *errors)
2431{
2432 PyUnicodeObject *v;
2433 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002434
Guido van Rossumd57fd912000-03-10 22:53:23 +00002435 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002436 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002437 Py_UNICODE r = *(unsigned char*)s;
2438 return PyUnicode_FromUnicode(&r, 1);
2439 }
2440
Guido van Rossumd57fd912000-03-10 22:53:23 +00002441 v = _PyUnicode_New(size);
2442 if (v == NULL)
2443 goto onError;
2444 if (size == 0)
2445 return (PyObject *)v;
2446 p = PyUnicode_AS_UNICODE(v);
2447 while (size-- > 0)
2448 *p++ = (unsigned char)*s++;
2449 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002450
Guido van Rossumd57fd912000-03-10 22:53:23 +00002451 onError:
2452 Py_XDECREF(v);
2453 return NULL;
2454}
2455
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002456/* create or adjust a UnicodeEncodeError */
2457static void make_encode_exception(PyObject **exceptionObject,
2458 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002459 const Py_UNICODE *unicode, Py_ssize_t size,
2460 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002461 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002462{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002463 if (*exceptionObject == NULL) {
2464 *exceptionObject = PyUnicodeEncodeError_Create(
2465 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002466 }
2467 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002468 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2469 goto onError;
2470 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2471 goto onError;
2472 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2473 goto onError;
2474 return;
2475 onError:
2476 Py_DECREF(*exceptionObject);
2477 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002478 }
2479}
2480
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002481/* raises a UnicodeEncodeError */
2482static void raise_encode_exception(PyObject **exceptionObject,
2483 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002484 const Py_UNICODE *unicode, Py_ssize_t size,
2485 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002486 const char *reason)
2487{
2488 make_encode_exception(exceptionObject,
2489 encoding, unicode, size, startpos, endpos, reason);
2490 if (*exceptionObject != NULL)
2491 PyCodec_StrictErrors(*exceptionObject);
2492}
2493
2494/* error handling callback helper:
2495 build arguments, call the callback and check the arguments,
2496 put the result into newpos and return the replacement string, which
2497 has to be freed by the caller */
2498static PyObject *unicode_encode_call_errorhandler(const char *errors,
2499 PyObject **errorHandler,
2500 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002501 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2502 Py_ssize_t startpos, Py_ssize_t endpos,
2503 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002504{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002505 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002506
2507 PyObject *restuple;
2508 PyObject *resunicode;
2509
2510 if (*errorHandler == NULL) {
2511 *errorHandler = PyCodec_LookupError(errors);
2512 if (*errorHandler == NULL)
2513 return NULL;
2514 }
2515
2516 make_encode_exception(exceptionObject,
2517 encoding, unicode, size, startpos, endpos, reason);
2518 if (*exceptionObject == NULL)
2519 return NULL;
2520
2521 restuple = PyObject_CallFunctionObjArgs(
2522 *errorHandler, *exceptionObject, NULL);
2523 if (restuple == NULL)
2524 return NULL;
2525 if (!PyTuple_Check(restuple)) {
2526 PyErr_Format(PyExc_TypeError, &argparse[4]);
2527 Py_DECREF(restuple);
2528 return NULL;
2529 }
2530 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2531 &resunicode, newpos)) {
2532 Py_DECREF(restuple);
2533 return NULL;
2534 }
2535 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002536 *newpos = size+*newpos;
2537 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002538 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002539 Py_DECREF(restuple);
2540 return NULL;
2541 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002542 Py_INCREF(resunicode);
2543 Py_DECREF(restuple);
2544 return resunicode;
2545}
2546
2547static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002548 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002549 const char *errors,
2550 int limit)
2551{
2552 /* output object */
2553 PyObject *res;
2554 /* pointers to the beginning and end+1 of input */
2555 const Py_UNICODE *startp = p;
2556 const Py_UNICODE *endp = p + size;
2557 /* pointer to the beginning of the unencodable characters */
2558 /* const Py_UNICODE *badp = NULL; */
2559 /* pointer into the output */
2560 char *str;
2561 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002562 Py_ssize_t respos = 0;
2563 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00002564 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2565 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002566 PyObject *errorHandler = NULL;
2567 PyObject *exc = NULL;
2568 /* the following variable is used for caching string comparisons
2569 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2570 int known_errorHandler = -1;
2571
2572 /* allocate enough for a simple encoding without
2573 replacements, if we need more, we'll resize */
2574 res = PyString_FromStringAndSize(NULL, size);
2575 if (res == NULL)
2576 goto onError;
2577 if (size == 0)
2578 return res;
2579 str = PyString_AS_STRING(res);
2580 ressize = size;
2581
2582 while (p<endp) {
2583 Py_UNICODE c = *p;
2584
2585 /* can we encode this? */
2586 if (c<limit) {
2587 /* no overflow check, because we know that the space is enough */
2588 *str++ = (char)c;
2589 ++p;
2590 }
2591 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002592 Py_ssize_t unicodepos = p-startp;
2593 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002594 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002595 Py_ssize_t repsize;
2596 Py_ssize_t newpos;
2597 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002598 Py_UNICODE *uni2;
2599 /* startpos for collecting unencodable chars */
2600 const Py_UNICODE *collstart = p;
2601 const Py_UNICODE *collend = p;
2602 /* find all unecodable characters */
2603 while ((collend < endp) && ((*collend)>=limit))
2604 ++collend;
2605 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2606 if (known_errorHandler==-1) {
2607 if ((errors==NULL) || (!strcmp(errors, "strict")))
2608 known_errorHandler = 1;
2609 else if (!strcmp(errors, "replace"))
2610 known_errorHandler = 2;
2611 else if (!strcmp(errors, "ignore"))
2612 known_errorHandler = 3;
2613 else if (!strcmp(errors, "xmlcharrefreplace"))
2614 known_errorHandler = 4;
2615 else
2616 known_errorHandler = 0;
2617 }
2618 switch (known_errorHandler) {
2619 case 1: /* strict */
2620 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2621 goto onError;
2622 case 2: /* replace */
2623 while (collstart++<collend)
2624 *str++ = '?'; /* fall through */
2625 case 3: /* ignore */
2626 p = collend;
2627 break;
2628 case 4: /* xmlcharrefreplace */
2629 respos = str-PyString_AS_STRING(res);
2630 /* determine replacement size (temporarily (mis)uses p) */
2631 for (p = collstart, repsize = 0; p < collend; ++p) {
2632 if (*p<10)
2633 repsize += 2+1+1;
2634 else if (*p<100)
2635 repsize += 2+2+1;
2636 else if (*p<1000)
2637 repsize += 2+3+1;
2638 else if (*p<10000)
2639 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002640#ifndef Py_UNICODE_WIDE
2641 else
2642 repsize += 2+5+1;
2643#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002644 else if (*p<100000)
2645 repsize += 2+5+1;
2646 else if (*p<1000000)
2647 repsize += 2+6+1;
2648 else
2649 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002650#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002651 }
2652 requiredsize = respos+repsize+(endp-collend);
2653 if (requiredsize > ressize) {
2654 if (requiredsize<2*ressize)
2655 requiredsize = 2*ressize;
2656 if (_PyString_Resize(&res, requiredsize))
2657 goto onError;
2658 str = PyString_AS_STRING(res) + respos;
2659 ressize = requiredsize;
2660 }
2661 /* generate replacement (temporarily (mis)uses p) */
2662 for (p = collstart; p < collend; ++p) {
2663 str += sprintf(str, "&#%d;", (int)*p);
2664 }
2665 p = collend;
2666 break;
2667 default:
2668 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2669 encoding, reason, startp, size, &exc,
2670 collstart-startp, collend-startp, &newpos);
2671 if (repunicode == NULL)
2672 goto onError;
2673 /* need more space? (at least enough for what we
2674 have+the replacement+the rest of the string, so
2675 we won't have to check space for encodable characters) */
2676 respos = str-PyString_AS_STRING(res);
2677 repsize = PyUnicode_GET_SIZE(repunicode);
2678 requiredsize = respos+repsize+(endp-collend);
2679 if (requiredsize > ressize) {
2680 if (requiredsize<2*ressize)
2681 requiredsize = 2*ressize;
2682 if (_PyString_Resize(&res, requiredsize)) {
2683 Py_DECREF(repunicode);
2684 goto onError;
2685 }
2686 str = PyString_AS_STRING(res) + respos;
2687 ressize = requiredsize;
2688 }
2689 /* check if there is anything unencodable in the replacement
2690 and copy it to the output */
2691 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2692 c = *uni2;
2693 if (c >= limit) {
2694 raise_encode_exception(&exc, encoding, startp, size,
2695 unicodepos, unicodepos+1, reason);
2696 Py_DECREF(repunicode);
2697 goto onError;
2698 }
2699 *str = (char)c;
2700 }
2701 p = startp + newpos;
2702 Py_DECREF(repunicode);
2703 }
2704 }
2705 }
2706 /* Resize if we allocated to much */
2707 respos = str-PyString_AS_STRING(res);
2708 if (respos<ressize)
2709 /* If this falls res will be NULL */
2710 _PyString_Resize(&res, respos);
2711 Py_XDECREF(errorHandler);
2712 Py_XDECREF(exc);
2713 return res;
2714
2715 onError:
2716 Py_XDECREF(res);
2717 Py_XDECREF(errorHandler);
2718 Py_XDECREF(exc);
2719 return NULL;
2720}
2721
Guido van Rossumd57fd912000-03-10 22:53:23 +00002722PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002723 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002724 const char *errors)
2725{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002726 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727}
2728
2729PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2730{
2731 if (!PyUnicode_Check(unicode)) {
2732 PyErr_BadArgument();
2733 return NULL;
2734 }
2735 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2736 PyUnicode_GET_SIZE(unicode),
2737 NULL);
2738}
2739
2740/* --- 7-bit ASCII Codec -------------------------------------------------- */
2741
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002743 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002744 const char *errors)
2745{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002746 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002747 PyUnicodeObject *v;
2748 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002749 Py_ssize_t startinpos;
2750 Py_ssize_t endinpos;
2751 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002752 const char *e;
2753 PyObject *errorHandler = NULL;
2754 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002755
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002757 if (size == 1 && *(unsigned char*)s < 128) {
2758 Py_UNICODE r = *(unsigned char*)s;
2759 return PyUnicode_FromUnicode(&r, 1);
2760 }
Tim Petersced69f82003-09-16 20:30:58 +00002761
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762 v = _PyUnicode_New(size);
2763 if (v == NULL)
2764 goto onError;
2765 if (size == 0)
2766 return (PyObject *)v;
2767 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002768 e = s + size;
2769 while (s < e) {
2770 register unsigned char c = (unsigned char)*s;
2771 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002772 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002773 ++s;
2774 }
2775 else {
2776 startinpos = s-starts;
2777 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002778 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002779 if (unicode_decode_call_errorhandler(
2780 errors, &errorHandler,
2781 "ascii", "ordinal not in range(128)",
2782 starts, size, &startinpos, &endinpos, &exc, &s,
2783 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002785 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002787 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00002788 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002789 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002790 Py_XDECREF(errorHandler);
2791 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002793
Guido van Rossumd57fd912000-03-10 22:53:23 +00002794 onError:
2795 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002796 Py_XDECREF(errorHandler);
2797 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798 return NULL;
2799}
2800
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002802 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803 const char *errors)
2804{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002805 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806}
2807
2808PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2809{
2810 if (!PyUnicode_Check(unicode)) {
2811 PyErr_BadArgument();
2812 return NULL;
2813 }
2814 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2815 PyUnicode_GET_SIZE(unicode),
2816 NULL);
2817}
2818
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002819#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002820
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002821/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002822
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002823PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002824 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002825 const char *errors)
2826{
2827 PyUnicodeObject *v;
2828 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002829 DWORD usize;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002830
2831 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002832 assert(size < INT_MAX);
2833 usize = MultiByteToWideChar(CP_ACP, 0, s, (int)size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002834 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002835 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2836
2837 v = _PyUnicode_New(usize);
2838 if (v == NULL)
2839 return NULL;
2840 if (usize == 0)
2841 return (PyObject *)v;
2842 p = PyUnicode_AS_UNICODE(v);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002843 if (0 == MultiByteToWideChar(CP_ACP, 0, s, (int)size, p, usize)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002844 Py_DECREF(v);
2845 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2846 }
2847
2848 return (PyObject *)v;
2849}
2850
2851PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002852 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002853 const char *errors)
2854{
2855 PyObject *repr;
2856 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002857 DWORD mbcssize;
2858
2859 /* If there are no characters, bail now! */
2860 if (size==0)
2861 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002862
2863 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002864 assert(size<INT_MAX);
2865 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, (int)size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002866 if (mbcssize==0)
2867 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2868
2869 repr = PyString_FromStringAndSize(NULL, mbcssize);
2870 if (repr == NULL)
2871 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002872 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002873 return repr;
2874
2875 /* Do the conversion */
2876 s = PyString_AS_STRING(repr);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002877 assert(size < INT_MAX);
2878 if (0 == WideCharToMultiByte(CP_ACP, 0, p, (int)size, s, mbcssize, NULL, NULL)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002879 Py_DECREF(repr);
2880 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2881 }
2882 return repr;
2883}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002884
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002885PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2886{
2887 if (!PyUnicode_Check(unicode)) {
2888 PyErr_BadArgument();
2889 return NULL;
2890 }
2891 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2892 PyUnicode_GET_SIZE(unicode),
2893 NULL);
2894}
2895
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002896#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002897
Guido van Rossumd57fd912000-03-10 22:53:23 +00002898/* --- Character Mapping Codec -------------------------------------------- */
2899
Guido van Rossumd57fd912000-03-10 22:53:23 +00002900PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002901 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002902 PyObject *mapping,
2903 const char *errors)
2904{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002905 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002906 Py_ssize_t startinpos;
2907 Py_ssize_t endinpos;
2908 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002909 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002910 PyUnicodeObject *v;
2911 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002912 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002913 PyObject *errorHandler = NULL;
2914 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002915 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002916 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00002917
Guido van Rossumd57fd912000-03-10 22:53:23 +00002918 /* Default to Latin-1 */
2919 if (mapping == NULL)
2920 return PyUnicode_DecodeLatin1(s, size, errors);
2921
2922 v = _PyUnicode_New(size);
2923 if (v == NULL)
2924 goto onError;
2925 if (size == 0)
2926 return (PyObject *)v;
2927 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002928 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002929 if (PyUnicode_CheckExact(mapping)) {
2930 mapstring = PyUnicode_AS_UNICODE(mapping);
2931 maplen = PyUnicode_GET_SIZE(mapping);
2932 while (s < e) {
2933 unsigned char ch = *s;
2934 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002935
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002936 if (ch < maplen)
2937 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002938
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002939 if (x == 0xfffe) {
2940 /* undefined mapping */
2941 outpos = p-PyUnicode_AS_UNICODE(v);
2942 startinpos = s-starts;
2943 endinpos = startinpos+1;
2944 if (unicode_decode_call_errorhandler(
2945 errors, &errorHandler,
2946 "charmap", "character maps to <undefined>",
2947 starts, size, &startinpos, &endinpos, &exc, &s,
2948 (PyObject **)&v, &outpos, &p)) {
2949 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002950 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002951 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002952 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002953 *p++ = x;
2954 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002955 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002956 }
2957 else {
2958 while (s < e) {
2959 unsigned char ch = *s;
2960 PyObject *w, *x;
2961
2962 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2963 w = PyInt_FromLong((long)ch);
2964 if (w == NULL)
2965 goto onError;
2966 x = PyObject_GetItem(mapping, w);
2967 Py_DECREF(w);
2968 if (x == NULL) {
2969 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2970 /* No mapping found means: mapping is undefined. */
2971 PyErr_Clear();
2972 x = Py_None;
2973 Py_INCREF(x);
2974 } else
2975 goto onError;
2976 }
2977
2978 /* Apply mapping */
2979 if (PyInt_Check(x)) {
2980 long value = PyInt_AS_LONG(x);
2981 if (value < 0 || value > 65535) {
2982 PyErr_SetString(PyExc_TypeError,
2983 "character mapping must be in range(65536)");
2984 Py_DECREF(x);
2985 goto onError;
2986 }
2987 *p++ = (Py_UNICODE)value;
2988 }
2989 else if (x == Py_None) {
2990 /* undefined mapping */
2991 outpos = p-PyUnicode_AS_UNICODE(v);
2992 startinpos = s-starts;
2993 endinpos = startinpos+1;
2994 if (unicode_decode_call_errorhandler(
2995 errors, &errorHandler,
2996 "charmap", "character maps to <undefined>",
2997 starts, size, &startinpos, &endinpos, &exc, &s,
2998 (PyObject **)&v, &outpos, &p)) {
2999 Py_DECREF(x);
3000 goto onError;
3001 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003002 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003003 continue;
3004 }
3005 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003006 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003007
3008 if (targetsize == 1)
3009 /* 1-1 mapping */
3010 *p++ = *PyUnicode_AS_UNICODE(x);
3011
3012 else if (targetsize > 1) {
3013 /* 1-n mapping */
3014 if (targetsize > extrachars) {
3015 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003016 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3017 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003018 (targetsize << 2);
3019 extrachars += needed;
3020 if (_PyUnicode_Resize(&v,
3021 PyUnicode_GET_SIZE(v) + needed) < 0) {
3022 Py_DECREF(x);
3023 goto onError;
3024 }
3025 p = PyUnicode_AS_UNICODE(v) + oldpos;
3026 }
3027 Py_UNICODE_COPY(p,
3028 PyUnicode_AS_UNICODE(x),
3029 targetsize);
3030 p += targetsize;
3031 extrachars -= targetsize;
3032 }
3033 /* 1-0 mapping: skip the character */
3034 }
3035 else {
3036 /* wrong return value */
3037 PyErr_SetString(PyExc_TypeError,
3038 "character mapping must return integer, None or unicode");
3039 Py_DECREF(x);
3040 goto onError;
3041 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003042 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003043 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003044 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003045 }
3046 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003047 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003048 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003049 Py_XDECREF(errorHandler);
3050 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003052
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003054 Py_XDECREF(errorHandler);
3055 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056 Py_XDECREF(v);
3057 return NULL;
3058}
3059
Martin v. Löwis3f767792006-06-04 19:36:28 +00003060/* Charmap encoding: the lookup table */
3061
3062struct encoding_map{
3063 PyObject_HEAD
3064 unsigned char level1[32];
3065 int count2, count3;
3066 unsigned char level23[1];
3067};
3068
3069static PyObject*
3070encoding_map_size(PyObject *obj, PyObject* args)
3071{
3072 struct encoding_map *map = (struct encoding_map*)obj;
3073 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3074 128*map->count3);
3075}
3076
3077static PyMethodDef encoding_map_methods[] = {
3078 {"size", encoding_map_size, METH_NOARGS,
3079 PyDoc_STR("Return the size (in bytes) of this object") },
3080 { 0 }
3081};
3082
3083static void
3084encoding_map_dealloc(PyObject* o)
3085{
3086 PyObject_FREE(o);
3087}
3088
3089static PyTypeObject EncodingMapType = {
3090 PyObject_HEAD_INIT(NULL)
3091 0, /*ob_size*/
3092 "EncodingMap", /*tp_name*/
3093 sizeof(struct encoding_map), /*tp_basicsize*/
3094 0, /*tp_itemsize*/
3095 /* methods */
3096 encoding_map_dealloc, /*tp_dealloc*/
3097 0, /*tp_print*/
3098 0, /*tp_getattr*/
3099 0, /*tp_setattr*/
3100 0, /*tp_compare*/
3101 0, /*tp_repr*/
3102 0, /*tp_as_number*/
3103 0, /*tp_as_sequence*/
3104 0, /*tp_as_mapping*/
3105 0, /*tp_hash*/
3106 0, /*tp_call*/
3107 0, /*tp_str*/
3108 0, /*tp_getattro*/
3109 0, /*tp_setattro*/
3110 0, /*tp_as_buffer*/
3111 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3112 0, /*tp_doc*/
3113 0, /*tp_traverse*/
3114 0, /*tp_clear*/
3115 0, /*tp_richcompare*/
3116 0, /*tp_weaklistoffset*/
3117 0, /*tp_iter*/
3118 0, /*tp_iternext*/
3119 encoding_map_methods, /*tp_methods*/
3120 0, /*tp_members*/
3121 0, /*tp_getset*/
3122 0, /*tp_base*/
3123 0, /*tp_dict*/
3124 0, /*tp_descr_get*/
3125 0, /*tp_descr_set*/
3126 0, /*tp_dictoffset*/
3127 0, /*tp_init*/
3128 0, /*tp_alloc*/
3129 0, /*tp_new*/
3130 0, /*tp_free*/
3131 0, /*tp_is_gc*/
3132};
3133
3134PyObject*
3135PyUnicode_BuildEncodingMap(PyObject* string)
3136{
3137 Py_UNICODE *decode;
3138 PyObject *result;
3139 struct encoding_map *mresult;
3140 int i;
3141 int need_dict = 0;
3142 unsigned char level1[32];
3143 unsigned char level2[512];
3144 unsigned char *mlevel1, *mlevel2, *mlevel3;
3145 int count2 = 0, count3 = 0;
3146
3147 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3148 PyErr_BadArgument();
3149 return NULL;
3150 }
3151 decode = PyUnicode_AS_UNICODE(string);
3152 memset(level1, 0xFF, sizeof level1);
3153 memset(level2, 0xFF, sizeof level2);
3154
3155 /* If there isn't a one-to-one mapping of NULL to \0,
3156 or if there are non-BMP characters, we need to use
3157 a mapping dictionary. */
3158 if (decode[0] != 0)
3159 need_dict = 1;
3160 for (i = 1; i < 256; i++) {
3161 int l1, l2;
3162 if (decode[i] == 0
3163 #ifdef Py_UNICODE_WIDE
3164 || decode[i] > 0xFFFF
3165 #endif
3166 ) {
3167 need_dict = 1;
3168 break;
3169 }
3170 if (decode[i] == 0xFFFE)
3171 /* unmapped character */
3172 continue;
3173 l1 = decode[i] >> 11;
3174 l2 = decode[i] >> 7;
3175 if (level1[l1] == 0xFF)
3176 level1[l1] = count2++;
3177 if (level2[l2] == 0xFF)
3178 level2[l2] = count3++;
3179 }
3180
3181 if (count2 >= 0xFF || count3 >= 0xFF)
3182 need_dict = 1;
3183
3184 if (need_dict) {
3185 PyObject *result = PyDict_New();
3186 PyObject *key, *value;
3187 if (!result)
3188 return NULL;
3189 for (i = 0; i < 256; i++) {
3190 key = value = NULL;
3191 key = PyInt_FromLong(decode[i]);
3192 value = PyInt_FromLong(i);
3193 if (!key || !value)
3194 goto failed1;
3195 if (PyDict_SetItem(result, key, value) == -1)
3196 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00003197 Py_DECREF(key);
3198 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003199 }
3200 return result;
3201 failed1:
3202 Py_XDECREF(key);
3203 Py_XDECREF(value);
3204 Py_DECREF(result);
3205 return NULL;
3206 }
3207
3208 /* Create a three-level trie */
3209 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3210 16*count2 + 128*count3 - 1);
3211 if (!result)
3212 return PyErr_NoMemory();
3213 PyObject_Init(result, &EncodingMapType);
3214 mresult = (struct encoding_map*)result;
3215 mresult->count2 = count2;
3216 mresult->count3 = count3;
3217 mlevel1 = mresult->level1;
3218 mlevel2 = mresult->level23;
3219 mlevel3 = mresult->level23 + 16*count2;
3220 memcpy(mlevel1, level1, 32);
3221 memset(mlevel2, 0xFF, 16*count2);
3222 memset(mlevel3, 0, 128*count3);
3223 count3 = 0;
3224 for (i = 1; i < 256; i++) {
3225 int o1, o2, o3, i2, i3;
3226 if (decode[i] == 0xFFFE)
3227 /* unmapped character */
3228 continue;
3229 o1 = decode[i]>>11;
3230 o2 = (decode[i]>>7) & 0xF;
3231 i2 = 16*mlevel1[o1] + o2;
3232 if (mlevel2[i2] == 0xFF)
3233 mlevel2[i2] = count3++;
3234 o3 = decode[i] & 0x7F;
3235 i3 = 128*mlevel2[i2] + o3;
3236 mlevel3[i3] = i;
3237 }
3238 return result;
3239}
3240
3241static int
3242encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3243{
3244 struct encoding_map *map = (struct encoding_map*)mapping;
3245 int l1 = c>>11;
3246 int l2 = (c>>7) & 0xF;
3247 int l3 = c & 0x7F;
3248 int i;
3249
3250#ifdef Py_UNICODE_WIDE
3251 if (c > 0xFFFF) {
3252 return -1;
3253 }
3254#endif
3255 if (c == 0)
3256 return 0;
3257 /* level 1*/
3258 i = map->level1[l1];
3259 if (i == 0xFF) {
3260 return -1;
3261 }
3262 /* level 2*/
3263 i = map->level23[16*i+l2];
3264 if (i == 0xFF) {
3265 return -1;
3266 }
3267 /* level 3 */
3268 i = map->level23[16*map->count2 + 128*i + l3];
3269 if (i == 0) {
3270 return -1;
3271 }
3272 return i;
3273}
3274
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003275/* Lookup the character ch in the mapping. If the character
3276 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003277 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003278static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003279{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003280 PyObject *w = PyInt_FromLong((long)c);
3281 PyObject *x;
3282
3283 if (w == NULL)
3284 return NULL;
3285 x = PyObject_GetItem(mapping, w);
3286 Py_DECREF(w);
3287 if (x == NULL) {
3288 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3289 /* No mapping found means: mapping is undefined. */
3290 PyErr_Clear();
3291 x = Py_None;
3292 Py_INCREF(x);
3293 return x;
3294 } else
3295 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003296 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003297 else if (x == Py_None)
3298 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003299 else if (PyInt_Check(x)) {
3300 long value = PyInt_AS_LONG(x);
3301 if (value < 0 || value > 255) {
3302 PyErr_SetString(PyExc_TypeError,
3303 "character mapping must be in range(256)");
3304 Py_DECREF(x);
3305 return NULL;
3306 }
3307 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003308 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003309 else if (PyString_Check(x))
3310 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003311 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003312 /* wrong return value */
3313 PyErr_SetString(PyExc_TypeError,
3314 "character mapping must return integer, None or str");
3315 Py_DECREF(x);
3316 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003317 }
3318}
3319
Martin v. Löwis3f767792006-06-04 19:36:28 +00003320static int
3321charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3322{
3323 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3324 /* exponentially overallocate to minimize reallocations */
3325 if (requiredsize < 2*outsize)
3326 requiredsize = 2*outsize;
3327 if (_PyString_Resize(outobj, requiredsize)) {
3328 return 0;
3329 }
3330 return 1;
3331}
3332
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred (lookup or resize failed) */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003336/* lookup the character, put the result in the output string and adjust
3337 various state variables. Reallocate the output string if not enough
3338 space is available. Return a new reference to the object that
3339 was put in the output buffer, or Py_None, if the mapping was undefined
3340 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003341 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003342static
Martin v. Löwis3f767792006-06-04 19:36:28 +00003343charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003344 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003345{
Martin v. Löwis3f767792006-06-04 19:36:28 +00003346 PyObject *rep;
3347 char *outstart;
3348 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003349
Martin v. Löwis3f767792006-06-04 19:36:28 +00003350 if (mapping->ob_type == &EncodingMapType) {
3351 int res = encoding_map_lookup(c, mapping);
3352 Py_ssize_t requiredsize = *outpos+1;
3353 if (res == -1)
3354 return enc_FAILED;
3355 if (outsize<requiredsize)
3356 if (!charmapencode_resize(outobj, outpos, requiredsize))
3357 return enc_EXCEPTION;
3358 outstart = PyString_AS_STRING(*outobj);
3359 outstart[(*outpos)++] = (char)res;
3360 return enc_SUCCESS;
3361 }
3362
3363 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003364 if (rep==NULL)
Martin v. Löwis3f767792006-06-04 19:36:28 +00003365 return enc_EXCEPTION;
3366 else if (rep==Py_None) {
3367 Py_DECREF(rep);
3368 return enc_FAILED;
3369 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003370 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003371 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003372 if (outsize<requiredsize)
3373 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003374 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003375 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003376 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003377 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003378 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3379 }
3380 else {
3381 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003382 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3383 Py_ssize_t requiredsize = *outpos+repsize;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003384 if (outsize<requiredsize)
3385 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003386 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003387 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003388 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003389 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003390 memcpy(outstart + *outpos, repchars, repsize);
3391 *outpos += repsize;
3392 }
3393 }
Georg Brandl9f167602006-06-04 21:46:16 +00003394 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003395 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003396}
3397
3398/* handle an error in PyUnicode_EncodeCharmap
3399 Return 0 on success, -1 on error */
3400static
3401int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003402 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003403 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003404 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003405 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003406{
3407 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003408 Py_ssize_t repsize;
3409 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003410 Py_UNICODE *uni2;
3411 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003412 Py_ssize_t collstartpos = *inpos;
3413 Py_ssize_t collendpos = *inpos+1;
3414 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003415 char *encoding = "charmap";
3416 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00003417 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003418
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003419 /* find all unencodable characters */
3420 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00003421 PyObject *rep;
3422 if (mapping->ob_type == &EncodingMapType) {
3423 int res = encoding_map_lookup(p[collendpos], mapping);
3424 if (res != -1)
3425 break;
3426 ++collendpos;
3427 continue;
3428 }
3429
3430 rep = charmapencode_lookup(p[collendpos], mapping);
3431 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003432 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003433 else if (rep!=Py_None) {
3434 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003435 break;
3436 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003437 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003438 ++collendpos;
3439 }
3440 /* cache callback name lookup
3441 * (if not done yet, i.e. it's the first error) */
3442 if (*known_errorHandler==-1) {
3443 if ((errors==NULL) || (!strcmp(errors, "strict")))
3444 *known_errorHandler = 1;
3445 else if (!strcmp(errors, "replace"))
3446 *known_errorHandler = 2;
3447 else if (!strcmp(errors, "ignore"))
3448 *known_errorHandler = 3;
3449 else if (!strcmp(errors, "xmlcharrefreplace"))
3450 *known_errorHandler = 4;
3451 else
3452 *known_errorHandler = 0;
3453 }
3454 switch (*known_errorHandler) {
3455 case 1: /* strict */
3456 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3457 return -1;
3458 case 2: /* replace */
3459 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3460 x = charmapencode_output('?', mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003461 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003462 return -1;
3463 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003464 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003465 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3466 return -1;
3467 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003468 }
3469 /* fall through */
3470 case 3: /* ignore */
3471 *inpos = collendpos;
3472 break;
3473 case 4: /* xmlcharrefreplace */
3474 /* generate replacement (temporarily (mis)uses p) */
3475 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3476 char buffer[2+29+1+1];
3477 char *cp;
3478 sprintf(buffer, "&#%d;", (int)p[collpos]);
3479 for (cp = buffer; *cp; ++cp) {
3480 x = charmapencode_output(*cp, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003481 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003482 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003483 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003484 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3485 return -1;
3486 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003487 }
3488 }
3489 *inpos = collendpos;
3490 break;
3491 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003492 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003493 encoding, reason, p, size, exceptionObject,
3494 collstartpos, collendpos, &newpos);
3495 if (repunicode == NULL)
3496 return -1;
3497 /* generate replacement */
3498 repsize = PyUnicode_GET_SIZE(repunicode);
3499 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3500 x = charmapencode_output(*uni2, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003501 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003502 return -1;
3503 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003504 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003505 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003506 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3507 return -1;
3508 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003509 }
3510 *inpos = newpos;
3511 Py_DECREF(repunicode);
3512 }
3513 return 0;
3514}
3515
Guido van Rossumd57fd912000-03-10 22:53:23 +00003516PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003517 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003518 PyObject *mapping,
3519 const char *errors)
3520{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003521 /* output object */
3522 PyObject *res = NULL;
3523 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003524 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003525 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003526 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003527 PyObject *errorHandler = NULL;
3528 PyObject *exc = NULL;
3529 /* the following variable is used for caching string comparisons
3530 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3531 * 3=ignore, 4=xmlcharrefreplace */
3532 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003533
3534 /* Default to Latin-1 */
3535 if (mapping == NULL)
3536 return PyUnicode_EncodeLatin1(p, size, errors);
3537
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003538 /* allocate enough for a simple encoding without
3539 replacements, if we need more, we'll resize */
3540 res = PyString_FromStringAndSize(NULL, size);
3541 if (res == NULL)
3542 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003543 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003544 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003545
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003546 while (inpos<size) {
3547 /* try to encode it */
Martin v. Löwis3f767792006-06-04 19:36:28 +00003548 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3549 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003550 goto onError;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003551 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003552 if (charmap_encoding_error(p, size, &inpos, mapping,
3553 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003554 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003555 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003556 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003557 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003558 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003559 else
3560 /* done with this character => adjust input position */
3561 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003562 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003564 /* Resize if we allocated to much */
3565 if (respos<PyString_GET_SIZE(res)) {
3566 if (_PyString_Resize(&res, respos))
3567 goto onError;
3568 }
3569 Py_XDECREF(exc);
3570 Py_XDECREF(errorHandler);
3571 return res;
3572
3573 onError:
3574 Py_XDECREF(res);
3575 Py_XDECREF(exc);
3576 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003577 return NULL;
3578}
3579
3580PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3581 PyObject *mapping)
3582{
3583 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3584 PyErr_BadArgument();
3585 return NULL;
3586 }
3587 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3588 PyUnicode_GET_SIZE(unicode),
3589 mapping,
3590 NULL);
3591}
3592
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003593/* create or adjust a UnicodeTranslateError */
3594static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003595 const Py_UNICODE *unicode, Py_ssize_t size,
3596 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003597 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003598{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003599 if (*exceptionObject == NULL) {
3600 *exceptionObject = PyUnicodeTranslateError_Create(
3601 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003602 }
3603 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003604 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3605 goto onError;
3606 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3607 goto onError;
3608 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3609 goto onError;
3610 return;
3611 onError:
3612 Py_DECREF(*exceptionObject);
3613 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003614 }
3615}
3616
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003617/* raises a UnicodeTranslateError */
3618static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003619 const Py_UNICODE *unicode, Py_ssize_t size,
3620 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003621 const char *reason)
3622{
3623 make_translate_exception(exceptionObject,
3624 unicode, size, startpos, endpos, reason);
3625 if (*exceptionObject != NULL)
3626 PyCodec_StrictErrors(*exceptionObject);
3627}
3628
3629/* error handling callback helper:
3630 build arguments, call the callback and check the arguments,
3631 put the result into newpos and return the replacement string, which
3632 has to be freed by the caller */
3633static PyObject *unicode_translate_call_errorhandler(const char *errors,
3634 PyObject **errorHandler,
3635 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003636 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3637 Py_ssize_t startpos, Py_ssize_t endpos,
3638 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003639{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003640 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003641
Martin v. Löwis412fb672006-04-13 06:34:32 +00003642 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003643 PyObject *restuple;
3644 PyObject *resunicode;
3645
3646 if (*errorHandler == NULL) {
3647 *errorHandler = PyCodec_LookupError(errors);
3648 if (*errorHandler == NULL)
3649 return NULL;
3650 }
3651
3652 make_translate_exception(exceptionObject,
3653 unicode, size, startpos, endpos, reason);
3654 if (*exceptionObject == NULL)
3655 return NULL;
3656
3657 restuple = PyObject_CallFunctionObjArgs(
3658 *errorHandler, *exceptionObject, NULL);
3659 if (restuple == NULL)
3660 return NULL;
3661 if (!PyTuple_Check(restuple)) {
3662 PyErr_Format(PyExc_TypeError, &argparse[4]);
3663 Py_DECREF(restuple);
3664 return NULL;
3665 }
3666 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003667 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003668 Py_DECREF(restuple);
3669 return NULL;
3670 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003671 if (i_newpos<0)
3672 *newpos = size+i_newpos;
3673 else
3674 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003675 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003676 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003677 Py_DECREF(restuple);
3678 return NULL;
3679 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003680 Py_INCREF(resunicode);
3681 Py_DECREF(restuple);
3682 return resunicode;
3683}
3684
3685/* Lookup the character ch in the mapping and put the result in result,
3686 which must be decrefed by the caller.
3687 Return 0 on success, -1 on error */
3688static
3689int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3690{
3691 PyObject *w = PyInt_FromLong((long)c);
3692 PyObject *x;
3693
3694 if (w == NULL)
3695 return -1;
3696 x = PyObject_GetItem(mapping, w);
3697 Py_DECREF(w);
3698 if (x == NULL) {
3699 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3700 /* No mapping found means: use 1:1 mapping. */
3701 PyErr_Clear();
3702 *result = NULL;
3703 return 0;
3704 } else
3705 return -1;
3706 }
3707 else if (x == Py_None) {
3708 *result = x;
3709 return 0;
3710 }
3711 else if (PyInt_Check(x)) {
3712 long value = PyInt_AS_LONG(x);
3713 long max = PyUnicode_GetMax();
3714 if (value < 0 || value > max) {
3715 PyErr_Format(PyExc_TypeError,
3716 "character mapping must be in range(0x%lx)", max+1);
3717 Py_DECREF(x);
3718 return -1;
3719 }
3720 *result = x;
3721 return 0;
3722 }
3723 else if (PyUnicode_Check(x)) {
3724 *result = x;
3725 return 0;
3726 }
3727 else {
3728 /* wrong return value */
3729 PyErr_SetString(PyExc_TypeError,
3730 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003731 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003732 return -1;
3733 }
3734}
3735/* ensure that *outobj is at least requiredsize characters long,
3736if not reallocate and adjust various state variables.
3737Return 0 on success, -1 on error */
3738static
Walter Dörwald4894c302003-10-24 14:25:28 +00003739int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003740 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003741{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003742 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003743 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003744 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003745 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003746 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003747 if (requiredsize < 2 * oldsize)
3748 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003749 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003750 return -1;
3751 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003752 }
3753 return 0;
3754}
3755/* lookup the character, put the result in the output string and adjust
3756 various state variables. Return a new reference to the object that
3757 was put in the output buffer in *result, or Py_None, if the mapping was
3758 undefined (in which case no character was written).
3759 The called must decref result.
3760 Return 0 on success, -1 on error. */
3761static
Walter Dörwald4894c302003-10-24 14:25:28 +00003762int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003763 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003764 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003765{
Walter Dörwald4894c302003-10-24 14:25:28 +00003766 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003767 return -1;
3768 if (*res==NULL) {
3769 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003770 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003771 }
3772 else if (*res==Py_None)
3773 ;
3774 else if (PyInt_Check(*res)) {
3775 /* no overflow check, because we know that the space is enough */
3776 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3777 }
3778 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003779 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003780 if (repsize==1) {
3781 /* no overflow check, because we know that the space is enough */
3782 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3783 }
3784 else if (repsize!=0) {
3785 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003786 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003787 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003788 repsize - 1;
3789 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003790 return -1;
3791 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3792 *outp += repsize;
3793 }
3794 }
3795 else
3796 return -1;
3797 return 0;
3798}
3799
3800PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003801 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003802 PyObject *mapping,
3803 const char *errors)
3804{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003805 /* output object */
3806 PyObject *res = NULL;
3807 /* pointers to the beginning and end+1 of input */
3808 const Py_UNICODE *startp = p;
3809 const Py_UNICODE *endp = p + size;
3810 /* pointer into the output */
3811 Py_UNICODE *str;
3812 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003813 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003814 char *reason = "character maps to <undefined>";
3815 PyObject *errorHandler = NULL;
3816 PyObject *exc = NULL;
3817 /* the following variable is used for caching string comparisons
3818 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3819 * 3=ignore, 4=xmlcharrefreplace */
3820 int known_errorHandler = -1;
3821
Guido van Rossumd57fd912000-03-10 22:53:23 +00003822 if (mapping == NULL) {
3823 PyErr_BadArgument();
3824 return NULL;
3825 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003826
3827 /* allocate enough for a simple 1:1 translation without
3828 replacements, if we need more, we'll resize */
3829 res = PyUnicode_FromUnicode(NULL, size);
3830 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003831 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003833 return res;
3834 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003835
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003836 while (p<endp) {
3837 /* try to encode it */
3838 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003839 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003840 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841 goto onError;
3842 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003843 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003844 if (x!=Py_None) /* it worked => adjust input pointer */
3845 ++p;
3846 else { /* untranslatable character */
3847 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003848 Py_ssize_t repsize;
3849 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003850 Py_UNICODE *uni2;
3851 /* startpos for collecting untranslatable chars */
3852 const Py_UNICODE *collstart = p;
3853 const Py_UNICODE *collend = p+1;
3854 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003855
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003856 /* find all untranslatable characters */
3857 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003858 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003859 goto onError;
3860 Py_XDECREF(x);
3861 if (x!=Py_None)
3862 break;
3863 ++collend;
3864 }
3865 /* cache callback name lookup
3866 * (if not done yet, i.e. it's the first error) */
3867 if (known_errorHandler==-1) {
3868 if ((errors==NULL) || (!strcmp(errors, "strict")))
3869 known_errorHandler = 1;
3870 else if (!strcmp(errors, "replace"))
3871 known_errorHandler = 2;
3872 else if (!strcmp(errors, "ignore"))
3873 known_errorHandler = 3;
3874 else if (!strcmp(errors, "xmlcharrefreplace"))
3875 known_errorHandler = 4;
3876 else
3877 known_errorHandler = 0;
3878 }
3879 switch (known_errorHandler) {
3880 case 1: /* strict */
3881 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3882 goto onError;
3883 case 2: /* replace */
3884 /* No need to check for space, this is a 1:1 replacement */
3885 for (coll = collstart; coll<collend; ++coll)
3886 *str++ = '?';
3887 /* fall through */
3888 case 3: /* ignore */
3889 p = collend;
3890 break;
3891 case 4: /* xmlcharrefreplace */
3892 /* generate replacement (temporarily (mis)uses p) */
3893 for (p = collstart; p < collend; ++p) {
3894 char buffer[2+29+1+1];
3895 char *cp;
3896 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003897 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003898 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3899 goto onError;
3900 for (cp = buffer; *cp; ++cp)
3901 *str++ = *cp;
3902 }
3903 p = collend;
3904 break;
3905 default:
3906 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3907 reason, startp, size, &exc,
3908 collstart-startp, collend-startp, &newpos);
3909 if (repunicode == NULL)
3910 goto onError;
3911 /* generate replacement */
3912 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003913 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003914 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3915 Py_DECREF(repunicode);
3916 goto onError;
3917 }
3918 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3919 *str++ = *uni2;
3920 p = startp + newpos;
3921 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003922 }
3923 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003924 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003925 /* Resize if we allocated to much */
3926 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003927 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003928 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003929 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003930 }
3931 Py_XDECREF(exc);
3932 Py_XDECREF(errorHandler);
3933 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003934
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003935 onError:
3936 Py_XDECREF(res);
3937 Py_XDECREF(exc);
3938 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003939 return NULL;
3940}
3941
3942PyObject *PyUnicode_Translate(PyObject *str,
3943 PyObject *mapping,
3944 const char *errors)
3945{
3946 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003947
Guido van Rossumd57fd912000-03-10 22:53:23 +00003948 str = PyUnicode_FromObject(str);
3949 if (str == NULL)
3950 goto onError;
3951 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3952 PyUnicode_GET_SIZE(str),
3953 mapping,
3954 errors);
3955 Py_DECREF(str);
3956 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003957
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958 onError:
3959 Py_XDECREF(str);
3960 return NULL;
3961}
Tim Petersced69f82003-09-16 20:30:58 +00003962
Guido van Rossum9e896b32000-04-05 20:11:21 +00003963/* --- Decimal Encoder ---------------------------------------------------- */
3964
3965int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003966 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00003967 char *output,
3968 const char *errors)
3969{
3970 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003971 PyObject *errorHandler = NULL;
3972 PyObject *exc = NULL;
3973 const char *encoding = "decimal";
3974 const char *reason = "invalid decimal Unicode string";
3975 /* the following variable is used for caching string comparisons
3976 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3977 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003978
3979 if (output == NULL) {
3980 PyErr_BadArgument();
3981 return -1;
3982 }
3983
3984 p = s;
3985 end = s + length;
3986 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003987 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003988 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003989 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003990 Py_ssize_t repsize;
3991 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003992 Py_UNICODE *uni2;
3993 Py_UNICODE *collstart;
3994 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003995
Guido van Rossum9e896b32000-04-05 20:11:21 +00003996 if (Py_UNICODE_ISSPACE(ch)) {
3997 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003998 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003999 continue;
4000 }
4001 decimal = Py_UNICODE_TODECIMAL(ch);
4002 if (decimal >= 0) {
4003 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004004 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004005 continue;
4006 }
Guido van Rossumba477042000-04-06 18:18:10 +00004007 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004008 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004009 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004010 continue;
4011 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004012 /* All other characters are considered unencodable */
4013 collstart = p;
4014 collend = p+1;
4015 while (collend < end) {
4016 if ((0 < *collend && *collend < 256) ||
4017 !Py_UNICODE_ISSPACE(*collend) ||
4018 Py_UNICODE_TODECIMAL(*collend))
4019 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004020 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004021 /* cache callback name lookup
4022 * (if not done yet, i.e. it's the first error) */
4023 if (known_errorHandler==-1) {
4024 if ((errors==NULL) || (!strcmp(errors, "strict")))
4025 known_errorHandler = 1;
4026 else if (!strcmp(errors, "replace"))
4027 known_errorHandler = 2;
4028 else if (!strcmp(errors, "ignore"))
4029 known_errorHandler = 3;
4030 else if (!strcmp(errors, "xmlcharrefreplace"))
4031 known_errorHandler = 4;
4032 else
4033 known_errorHandler = 0;
4034 }
4035 switch (known_errorHandler) {
4036 case 1: /* strict */
4037 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4038 goto onError;
4039 case 2: /* replace */
4040 for (p = collstart; p < collend; ++p)
4041 *output++ = '?';
4042 /* fall through */
4043 case 3: /* ignore */
4044 p = collend;
4045 break;
4046 case 4: /* xmlcharrefreplace */
4047 /* generate replacement (temporarily (mis)uses p) */
4048 for (p = collstart; p < collend; ++p)
4049 output += sprintf(output, "&#%d;", (int)*p);
4050 p = collend;
4051 break;
4052 default:
4053 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4054 encoding, reason, s, length, &exc,
4055 collstart-s, collend-s, &newpos);
4056 if (repunicode == NULL)
4057 goto onError;
4058 /* generate replacement */
4059 repsize = PyUnicode_GET_SIZE(repunicode);
4060 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4061 Py_UNICODE ch = *uni2;
4062 if (Py_UNICODE_ISSPACE(ch))
4063 *output++ = ' ';
4064 else {
4065 decimal = Py_UNICODE_TODECIMAL(ch);
4066 if (decimal >= 0)
4067 *output++ = '0' + decimal;
4068 else if (0 < ch && ch < 256)
4069 *output++ = (char)ch;
4070 else {
4071 Py_DECREF(repunicode);
4072 raise_encode_exception(&exc, encoding,
4073 s, length, collstart-s, collend-s, reason);
4074 goto onError;
4075 }
4076 }
4077 }
4078 p = s + newpos;
4079 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004080 }
4081 }
4082 /* 0-terminate the output string */
4083 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004084 Py_XDECREF(exc);
4085 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004086 return 0;
4087
4088 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004089 Py_XDECREF(exc);
4090 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004091 return -1;
4092}
4093
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094/* --- Helpers ------------------------------------------------------------ */
4095
Fredrik Lundha50d2012006-05-26 17:04:58 +00004096#define STRINGLIB_CHAR Py_UNICODE
Fredrik Lundh6471ee42006-05-24 14:28:11 +00004097
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004098#define STRINGLIB_LEN PyUnicode_GET_SIZE
Fredrik Lundhb9479482006-05-26 17:22:38 +00004099#define STRINGLIB_NEW PyUnicode_FromUnicode
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004100#define STRINGLIB_STR PyUnicode_AS_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00004101
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00004102Py_LOCAL_INLINE(int)
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004103STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
4104{
Fredrik Lundh9c0e9c02006-05-26 18:24:15 +00004105 if (str[0] != other[0])
4106 return 1;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004107 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
4108}
4109
Fredrik Lundhb9479482006-05-26 17:22:38 +00004110#define STRINGLIB_EMPTY unicode_empty
4111
Fredrik Lundha50d2012006-05-26 17:04:58 +00004112#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004113
4114#include "stringlib/count.h"
4115#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00004116#include "stringlib/partition.h"
4117
/* Helper macro to fix up start/end slice values.
 *
 * Operates on the caller's local variables `start` and `end`, using
 * (obj)->length as the sequence length: a negative value has the length
 * added once and is then clamped to 0; `end` is additionally capped at
 * the length before that adjustment.
 *
 * Wrapped in do { } while (0) so that it expands to exactly one
 * statement; invoke it with a trailing semicolon.
 */
#define FIX_START_END(obj)                  \
    do {                                    \
        if (start < 0)                      \
            start += (obj)->length;         \
        if (start < 0)                      \
            start = 0;                      \
        if (end > (obj)->length)            \
            end = (obj)->length;            \
        if (end < 0)                        \
            end += (obj)->length;           \
        if (end < 0)                        \
            end = 0;                        \
    } while (0)
4130
Martin v. Löwis18e16552006-02-15 17:27:45 +00004131Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004132 PyObject *substr,
4133 Py_ssize_t start,
4134 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004136 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004137 PyUnicodeObject* str_obj;
4138 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004139
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004140 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4141 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004142 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004143 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4144 if (!sub_obj) {
4145 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004146 return -1;
4147 }
Tim Petersced69f82003-09-16 20:30:58 +00004148
Fredrik Lundhc8162812006-05-26 19:33:03 +00004149 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004150
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004151 result = stringlib_count(
4152 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4153 );
4154
4155 Py_DECREF(sub_obj);
4156 Py_DECREF(str_obj);
4157
Guido van Rossumd57fd912000-03-10 22:53:23 +00004158 return result;
4159}
4160
Martin v. Löwis18e16552006-02-15 17:27:45 +00004161Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004162 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004163 Py_ssize_t start,
4164 Py_ssize_t end,
4165 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004166{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004167 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004168
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004169 str = PyUnicode_FromObject(str);
4170 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004171 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004172 sub = PyUnicode_FromObject(sub);
4173 if (!sub) {
4174 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004175 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004176 }
Tim Petersced69f82003-09-16 20:30:58 +00004177
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004178 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004179 result = stringlib_find_slice(
4180 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4181 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4182 start, end
4183 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004184 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004185 result = stringlib_rfind_slice(
4186 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4187 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4188 start, end
4189 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004190
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004191 Py_DECREF(str);
4192 Py_DECREF(sub);
4193
Guido van Rossumd57fd912000-03-10 22:53:23 +00004194 return result;
4195}
4196
Tim Petersced69f82003-09-16 20:30:58 +00004197static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004198int tailmatch(PyUnicodeObject *self,
4199 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004200 Py_ssize_t start,
4201 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004202 int direction)
4203{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004204 if (substring->length == 0)
4205 return 1;
4206
Fredrik Lundhc8162812006-05-26 19:33:03 +00004207 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004208
4209 end -= substring->length;
4210 if (end < start)
4211 return 0;
4212
4213 if (direction > 0) {
4214 if (Py_UNICODE_MATCH(self, end, substring))
4215 return 1;
4216 } else {
4217 if (Py_UNICODE_MATCH(self, start, substring))
4218 return 1;
4219 }
4220
4221 return 0;
4222}
4223
Martin v. Löwis18e16552006-02-15 17:27:45 +00004224Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004225 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004226 Py_ssize_t start,
4227 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004228 int direction)
4229{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004230 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004231
Guido van Rossumd57fd912000-03-10 22:53:23 +00004232 str = PyUnicode_FromObject(str);
4233 if (str == NULL)
4234 return -1;
4235 substr = PyUnicode_FromObject(substr);
4236 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004237 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004238 return -1;
4239 }
Tim Petersced69f82003-09-16 20:30:58 +00004240
Guido van Rossumd57fd912000-03-10 22:53:23 +00004241 result = tailmatch((PyUnicodeObject *)str,
4242 (PyUnicodeObject *)substr,
4243 start, end, direction);
4244 Py_DECREF(str);
4245 Py_DECREF(substr);
4246 return result;
4247}
4248
Guido van Rossumd57fd912000-03-10 22:53:23 +00004249/* Apply fixfct filter to the Unicode object self and return a
4250 reference to the modified object */
4251
Tim Petersced69f82003-09-16 20:30:58 +00004252static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004253PyObject *fixup(PyUnicodeObject *self,
4254 int (*fixfct)(PyUnicodeObject *s))
4255{
4256
4257 PyUnicodeObject *u;
4258
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004259 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004260 if (u == NULL)
4261 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004262
4263 Py_UNICODE_COPY(u->str, self->str, self->length);
4264
Tim Peters7a29bd52001-09-12 03:03:31 +00004265 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004266 /* fixfct should return TRUE if it modified the buffer. If
4267 FALSE, return a reference to the original buffer instead
4268 (to save space, not time) */
4269 Py_INCREF(self);
4270 Py_DECREF(u);
4271 return (PyObject*) self;
4272 }
4273 return (PyObject*) u;
4274}
4275
Tim Petersced69f82003-09-16 20:30:58 +00004276static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004277int fixupper(PyUnicodeObject *self)
4278{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004279 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004280 Py_UNICODE *s = self->str;
4281 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004282
Guido van Rossumd57fd912000-03-10 22:53:23 +00004283 while (len-- > 0) {
4284 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004285
Guido van Rossumd57fd912000-03-10 22:53:23 +00004286 ch = Py_UNICODE_TOUPPER(*s);
4287 if (ch != *s) {
4288 status = 1;
4289 *s = ch;
4290 }
4291 s++;
4292 }
4293
4294 return status;
4295}
4296
Tim Petersced69f82003-09-16 20:30:58 +00004297static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004298int fixlower(PyUnicodeObject *self)
4299{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004300 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004301 Py_UNICODE *s = self->str;
4302 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004303
Guido van Rossumd57fd912000-03-10 22:53:23 +00004304 while (len-- > 0) {
4305 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004306
Guido van Rossumd57fd912000-03-10 22:53:23 +00004307 ch = Py_UNICODE_TOLOWER(*s);
4308 if (ch != *s) {
4309 status = 1;
4310 *s = ch;
4311 }
4312 s++;
4313 }
4314
4315 return status;
4316}
4317
Tim Petersced69f82003-09-16 20:30:58 +00004318static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004319int fixswapcase(PyUnicodeObject *self)
4320{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004321 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004322 Py_UNICODE *s = self->str;
4323 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004324
Guido van Rossumd57fd912000-03-10 22:53:23 +00004325 while (len-- > 0) {
4326 if (Py_UNICODE_ISUPPER(*s)) {
4327 *s = Py_UNICODE_TOLOWER(*s);
4328 status = 1;
4329 } else if (Py_UNICODE_ISLOWER(*s)) {
4330 *s = Py_UNICODE_TOUPPER(*s);
4331 status = 1;
4332 }
4333 s++;
4334 }
4335
4336 return status;
4337}
4338
Tim Petersced69f82003-09-16 20:30:58 +00004339static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004340int fixcapitalize(PyUnicodeObject *self)
4341{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004342 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004343 Py_UNICODE *s = self->str;
4344 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004345
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004346 if (len == 0)
4347 return 0;
4348 if (Py_UNICODE_ISLOWER(*s)) {
4349 *s = Py_UNICODE_TOUPPER(*s);
4350 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004351 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004352 s++;
4353 while (--len > 0) {
4354 if (Py_UNICODE_ISUPPER(*s)) {
4355 *s = Py_UNICODE_TOLOWER(*s);
4356 status = 1;
4357 }
4358 s++;
4359 }
4360 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004361}
4362
4363static
4364int fixtitle(PyUnicodeObject *self)
4365{
4366 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4367 register Py_UNICODE *e;
4368 int previous_is_cased;
4369
4370 /* Shortcut for single character strings */
4371 if (PyUnicode_GET_SIZE(self) == 1) {
4372 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4373 if (*p != ch) {
4374 *p = ch;
4375 return 1;
4376 }
4377 else
4378 return 0;
4379 }
Tim Petersced69f82003-09-16 20:30:58 +00004380
Guido van Rossumd57fd912000-03-10 22:53:23 +00004381 e = p + PyUnicode_GET_SIZE(self);
4382 previous_is_cased = 0;
4383 for (; p < e; p++) {
4384 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004385
Guido van Rossumd57fd912000-03-10 22:53:23 +00004386 if (previous_is_cased)
4387 *p = Py_UNICODE_TOLOWER(ch);
4388 else
4389 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004390
4391 if (Py_UNICODE_ISLOWER(ch) ||
4392 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004393 Py_UNICODE_ISTITLE(ch))
4394 previous_is_cased = 1;
4395 else
4396 previous_is_cased = 0;
4397 }
4398 return 1;
4399}
4400
Tim Peters8ce9f162004-08-27 01:49:32 +00004401PyObject *
4402PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004403{
Tim Peters8ce9f162004-08-27 01:49:32 +00004404 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004405 const Py_UNICODE blank = ' ';
4406 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00004407 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004408 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00004409 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4410 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004411 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4412 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004413 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004414 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004415 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004416
Tim Peters05eba1f2004-08-27 21:32:02 +00004417 fseq = PySequence_Fast(seq, "");
4418 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004419 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004420 }
4421
Tim Peters91879ab2004-08-27 22:35:44 +00004422 /* Grrrr. A codec may be invoked to convert str objects to
4423 * Unicode, and so it's possible to call back into Python code
4424 * during PyUnicode_FromObject(), and so it's possible for a sick
4425 * codec to change the size of fseq (if seq is a list). Therefore
4426 * we have to keep refetching the size -- can't assume seqlen
4427 * is invariant.
4428 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004429 seqlen = PySequence_Fast_GET_SIZE(fseq);
4430 /* If empty sequence, return u"". */
4431 if (seqlen == 0) {
4432 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4433 goto Done;
4434 }
4435 /* If singleton sequence with an exact Unicode, return that. */
4436 if (seqlen == 1) {
4437 item = PySequence_Fast_GET_ITEM(fseq, 0);
4438 if (PyUnicode_CheckExact(item)) {
4439 Py_INCREF(item);
4440 res = (PyUnicodeObject *)item;
4441 goto Done;
4442 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004443 }
4444
Tim Peters05eba1f2004-08-27 21:32:02 +00004445 /* At least two items to join, or one that isn't exact Unicode. */
4446 if (seqlen > 1) {
4447 /* Set up sep and seplen -- they're needed. */
4448 if (separator == NULL) {
4449 sep = &blank;
4450 seplen = 1;
4451 }
4452 else {
4453 internal_separator = PyUnicode_FromObject(separator);
4454 if (internal_separator == NULL)
4455 goto onError;
4456 sep = PyUnicode_AS_UNICODE(internal_separator);
4457 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004458 /* In case PyUnicode_FromObject() mutated seq. */
4459 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004460 }
4461 }
4462
4463 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004464 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004465 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004466 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004467 res_p = PyUnicode_AS_UNICODE(res);
4468 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004469
Tim Peters05eba1f2004-08-27 21:32:02 +00004470 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00004471 Py_ssize_t itemlen;
4472 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004473
4474 item = PySequence_Fast_GET_ITEM(fseq, i);
4475 /* Convert item to Unicode. */
4476 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4477 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00004478 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004479 " %.80s found",
4480 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004481 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004482 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004483 item = PyUnicode_FromObject(item);
4484 if (item == NULL)
4485 goto onError;
4486 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004487
Tim Peters91879ab2004-08-27 22:35:44 +00004488 /* In case PyUnicode_FromObject() mutated seq. */
4489 seqlen = PySequence_Fast_GET_SIZE(fseq);
4490
Tim Peters8ce9f162004-08-27 01:49:32 +00004491 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004492 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004493 new_res_used = res_used + itemlen;
Georg Brandl90e27d32006-06-10 06:40:50 +00004494 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004495 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004496 if (i < seqlen - 1) {
4497 new_res_used += seplen;
Georg Brandl90e27d32006-06-10 06:40:50 +00004498 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004499 goto Overflow;
4500 }
4501 if (new_res_used > res_alloc) {
4502 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004503 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004504 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00004505 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004506 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004507 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00004508 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004509 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004510 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004511 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004512 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004513 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004514
4515 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004516 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004517 res_p += itemlen;
4518 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004519 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004520 res_p += seplen;
4521 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004522 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004523 res_used = new_res_used;
4524 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004525
Tim Peters05eba1f2004-08-27 21:32:02 +00004526 /* Shrink res to match the used area; this probably can't fail,
4527 * but it's cheap to check.
4528 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004529 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004530 goto onError;
4531
4532 Done:
4533 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004534 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004535 return (PyObject *)res;
4536
Tim Peters8ce9f162004-08-27 01:49:32 +00004537 Overflow:
4538 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00004539 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00004540 Py_DECREF(item);
4541 /* fall through */
4542
Guido van Rossumd57fd912000-03-10 22:53:23 +00004543 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004544 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004545 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004546 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004547 return NULL;
4548}
4549
Tim Petersced69f82003-09-16 20:30:58 +00004550static
4551PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004552 Py_ssize_t left,
4553 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004554 Py_UNICODE fill)
4555{
4556 PyUnicodeObject *u;
4557
4558 if (left < 0)
4559 left = 0;
4560 if (right < 0)
4561 right = 0;
4562
Tim Peters7a29bd52001-09-12 03:03:31 +00004563 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004564 Py_INCREF(self);
4565 return self;
4566 }
4567
4568 u = _PyUnicode_New(left + self->length + right);
4569 if (u) {
4570 if (left)
4571 Py_UNICODE_FILL(u->str, fill, left);
4572 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4573 if (right)
4574 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4575 }
4576
4577 return u;
4578}
4579
/* Append the substring data[left:right] to the list being built.
 *
 * Relies on names in the expanding function: a PyObject* variable `str`
 * used as scratch, the target `list`, and an `onError` label that is
 * jumped to when creating the substring or appending it fails.  The
 * temporary reference in `str` is released on both paths.
 *
 * Wrapped in do { } while (0) so that it expands to exactly one
 * statement; invoke it with a trailing semicolon.
 */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4590
4591static
4592PyObject *split_whitespace(PyUnicodeObject *self,
4593 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004594 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004595{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004596 register Py_ssize_t i;
4597 register Py_ssize_t j;
4598 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004599 PyObject *str;
4600
4601 for (i = j = 0; i < len; ) {
4602 /* find a token */
4603 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4604 i++;
4605 j = i;
4606 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4607 i++;
4608 if (j < i) {
4609 if (maxcount-- <= 0)
4610 break;
4611 SPLIT_APPEND(self->str, j, i);
4612 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4613 i++;
4614 j = i;
4615 }
4616 }
4617 if (j < len) {
4618 SPLIT_APPEND(self->str, j, len);
4619 }
4620 return list;
4621
4622 onError:
4623 Py_DECREF(list);
4624 return NULL;
4625}
4626
4627PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004628 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004629{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004630 register Py_ssize_t i;
4631 register Py_ssize_t j;
4632 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004633 PyObject *list;
4634 PyObject *str;
4635 Py_UNICODE *data;
4636
4637 string = PyUnicode_FromObject(string);
4638 if (string == NULL)
4639 return NULL;
4640 data = PyUnicode_AS_UNICODE(string);
4641 len = PyUnicode_GET_SIZE(string);
4642
Guido van Rossumd57fd912000-03-10 22:53:23 +00004643 list = PyList_New(0);
4644 if (!list)
4645 goto onError;
4646
4647 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004648 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004649
Guido van Rossumd57fd912000-03-10 22:53:23 +00004650 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004651 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004652 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004653
4654 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004655 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004656 if (i < len) {
4657 if (data[i] == '\r' && i + 1 < len &&
4658 data[i+1] == '\n')
4659 i += 2;
4660 else
4661 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004662 if (keepends)
4663 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004664 }
Guido van Rossum86662912000-04-11 15:38:46 +00004665 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004666 j = i;
4667 }
4668 if (j < len) {
4669 SPLIT_APPEND(data, j, len);
4670 }
4671
4672 Py_DECREF(string);
4673 return list;
4674
4675 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004676 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004677 Py_DECREF(string);
4678 return NULL;
4679}
4680
Tim Petersced69f82003-09-16 20:30:58 +00004681static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004682PyObject *split_char(PyUnicodeObject *self,
4683 PyObject *list,
4684 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004685 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004686{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004687 register Py_ssize_t i;
4688 register Py_ssize_t j;
4689 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004690 PyObject *str;
4691
4692 for (i = j = 0; i < len; ) {
4693 if (self->str[i] == ch) {
4694 if (maxcount-- <= 0)
4695 break;
4696 SPLIT_APPEND(self->str, j, i);
4697 i = j = i + 1;
4698 } else
4699 i++;
4700 }
4701 if (j <= len) {
4702 SPLIT_APPEND(self->str, j, len);
4703 }
4704 return list;
4705
4706 onError:
4707 Py_DECREF(list);
4708 return NULL;
4709}
4710
Tim Petersced69f82003-09-16 20:30:58 +00004711static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004712PyObject *split_substring(PyUnicodeObject *self,
4713 PyObject *list,
4714 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004715 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004716{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004717 register Py_ssize_t i;
4718 register Py_ssize_t j;
4719 Py_ssize_t len = self->length;
4720 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004721 PyObject *str;
4722
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004723 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004724 if (Py_UNICODE_MATCH(self, i, substring)) {
4725 if (maxcount-- <= 0)
4726 break;
4727 SPLIT_APPEND(self->str, j, i);
4728 i = j = i + sublen;
4729 } else
4730 i++;
4731 }
4732 if (j <= len) {
4733 SPLIT_APPEND(self->str, j, len);
4734 }
4735 return list;
4736
4737 onError:
4738 Py_DECREF(list);
4739 return NULL;
4740}
4741
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004742static
4743PyObject *rsplit_whitespace(PyUnicodeObject *self,
4744 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004745 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004746{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004747 register Py_ssize_t i;
4748 register Py_ssize_t j;
4749 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004750 PyObject *str;
4751
4752 for (i = j = len - 1; i >= 0; ) {
4753 /* find a token */
4754 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4755 i--;
4756 j = i;
4757 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4758 i--;
4759 if (j > i) {
4760 if (maxcount-- <= 0)
4761 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004762 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004763 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4764 i--;
4765 j = i;
4766 }
4767 }
4768 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004769 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004770 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004771 if (PyList_Reverse(list) < 0)
4772 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004773 return list;
4774
4775 onError:
4776 Py_DECREF(list);
4777 return NULL;
4778}
4779
4780static
4781PyObject *rsplit_char(PyUnicodeObject *self,
4782 PyObject *list,
4783 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004784 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004785{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004786 register Py_ssize_t i;
4787 register Py_ssize_t j;
4788 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004789 PyObject *str;
4790
4791 for (i = j = len - 1; i >= 0; ) {
4792 if (self->str[i] == ch) {
4793 if (maxcount-- <= 0)
4794 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004795 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004796 j = i = i - 1;
4797 } else
4798 i--;
4799 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004800 if (j >= -1) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004801 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004802 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004803 if (PyList_Reverse(list) < 0)
4804 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004805 return list;
4806
4807 onError:
4808 Py_DECREF(list);
4809 return NULL;
4810}
4811
4812static
4813PyObject *rsplit_substring(PyUnicodeObject *self,
4814 PyObject *list,
4815 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004816 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004817{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004818 register Py_ssize_t i;
4819 register Py_ssize_t j;
4820 Py_ssize_t len = self->length;
4821 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004822 PyObject *str;
4823
4824 for (i = len - sublen, j = len; i >= 0; ) {
4825 if (Py_UNICODE_MATCH(self, i, substring)) {
4826 if (maxcount-- <= 0)
4827 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004828 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004829 j = i;
4830 i -= sublen;
4831 } else
4832 i--;
4833 }
4834 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004835 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004836 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004837 if (PyList_Reverse(list) < 0)
4838 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004839 return list;
4840
4841 onError:
4842 Py_DECREF(list);
4843 return NULL;
4844}
4845
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846#undef SPLIT_APPEND
4847
4848static
4849PyObject *split(PyUnicodeObject *self,
4850 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004851 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852{
4853 PyObject *list;
4854
4855 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004856 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857
4858 list = PyList_New(0);
4859 if (!list)
4860 return NULL;
4861
4862 if (substring == NULL)
4863 return split_whitespace(self,list,maxcount);
4864
4865 else if (substring->length == 1)
4866 return split_char(self,list,substring->str[0],maxcount);
4867
4868 else if (substring->length == 0) {
4869 Py_DECREF(list);
4870 PyErr_SetString(PyExc_ValueError, "empty separator");
4871 return NULL;
4872 }
4873 else
4874 return split_substring(self,list,substring,maxcount);
4875}
4876
Tim Petersced69f82003-09-16 20:30:58 +00004877static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004878PyObject *rsplit(PyUnicodeObject *self,
4879 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004880 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004881{
4882 PyObject *list;
4883
4884 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004885 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004886
4887 list = PyList_New(0);
4888 if (!list)
4889 return NULL;
4890
4891 if (substring == NULL)
4892 return rsplit_whitespace(self,list,maxcount);
4893
4894 else if (substring->length == 1)
4895 return rsplit_char(self,list,substring->str[0],maxcount);
4896
4897 else if (substring->length == 0) {
4898 Py_DECREF(list);
4899 PyErr_SetString(PyExc_ValueError, "empty separator");
4900 return NULL;
4901 }
4902 else
4903 return rsplit_substring(self,list,substring,maxcount);
4904}
4905
4906static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004907PyObject *replace(PyUnicodeObject *self,
4908 PyUnicodeObject *str1,
4909 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004910 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004911{
4912 PyUnicodeObject *u;
4913
4914 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004915 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004916
Fredrik Lundh347ee272006-05-24 16:35:18 +00004917 if (str1->length == str2->length) {
4918 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004919 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00004920 if (str1->length == 1) {
4921 /* replace characters */
4922 Py_UNICODE u1, u2;
4923 if (!findchar(self->str, self->length, str1->str[0]))
4924 goto nothing;
4925 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
4926 if (!u)
4927 return NULL;
4928 Py_UNICODE_COPY(u->str, self->str, self->length);
4929 u1 = str1->str[0];
4930 u2 = str2->str[0];
4931 for (i = 0; i < u->length; i++)
4932 if (u->str[i] == u1) {
4933 if (--maxcount < 0)
4934 break;
4935 u->str[i] = u2;
4936 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00004938 i = fastsearch(
4939 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00004940 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00004941 if (i < 0)
4942 goto nothing;
4943 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
4944 if (!u)
4945 return NULL;
4946 Py_UNICODE_COPY(u->str, self->str, self->length);
4947 while (i <= self->length - str1->length)
4948 if (Py_UNICODE_MATCH(self, i, str1)) {
4949 if (--maxcount < 0)
4950 break;
4951 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
4952 i += str1->length;
4953 } else
4954 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004955 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004956 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00004957
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00004958 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00004959 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004960 Py_UNICODE *p;
4961
4962 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004963 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004964 if (n > maxcount)
4965 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00004966 if (n == 0)
4967 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00004968 /* new_size = self->length + n * (str2->length - str1->length)); */
4969 delta = (str2->length - str1->length);
4970 if (delta == 0) {
4971 new_size = self->length;
4972 } else {
4973 product = n * (str2->length - str1->length);
4974 if ((product / (str2->length - str1->length)) != n) {
4975 PyErr_SetString(PyExc_OverflowError,
4976 "replace string is too long");
4977 return NULL;
4978 }
4979 new_size = self->length + product;
4980 if (new_size < 0) {
4981 PyErr_SetString(PyExc_OverflowError,
4982 "replace string is too long");
4983 return NULL;
4984 }
4985 }
4986 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00004987 if (!u)
4988 return NULL;
4989 i = 0;
4990 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00004991 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00004992 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00004993 while (n-- > 0) {
4994 /* look for next match */
4995 j = i;
4996 while (j <= e) {
4997 if (Py_UNICODE_MATCH(self, j, str1))
4998 break;
4999 j++;
5000 }
5001 if (j > i) {
5002 if (j > e)
5003 break;
5004 /* copy unchanged part [i:j] */
5005 Py_UNICODE_COPY(p, self->str+i, j-i);
5006 p += j - i;
5007 }
5008 /* copy substitution string */
5009 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005010 Py_UNICODE_COPY(p, str2->str, str2->length);
5011 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005012 }
5013 i = j + str1->length;
5014 }
5015 if (i < self->length)
5016 /* copy tail [i:] */
5017 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005018 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005019 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005020 while (n > 0) {
5021 Py_UNICODE_COPY(p, str2->str, str2->length);
5022 p += str2->length;
5023 if (--n <= 0)
5024 break;
5025 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005026 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005027 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005028 }
5029 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005030 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005031
5032nothing:
5033 /* nothing to replace; return original string (when possible) */
5034 if (PyUnicode_CheckExact(self)) {
5035 Py_INCREF(self);
5036 return (PyObject *) self;
5037 }
5038 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005039}
5040
5041/* --- Unicode Object Methods --------------------------------------------- */
5042
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005043PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005044"S.title() -> unicode\n\
5045\n\
5046Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005047characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005048
5049static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005050unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005051{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005052 return fixup(self, fixtitle);
5053}
5054
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005055PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005056"S.capitalize() -> unicode\n\
5057\n\
5058Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005059have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005060
5061static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005062unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005063{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005064 return fixup(self, fixcapitalize);
5065}
5066
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out) implementation of S.capwords(): split `self` on
   whitespace, capitalize every word in place inside the list, then join
   the words again with PyUnicode_Join(NULL, ...).  Returns a new
   reference, or NULL on failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    while (idx < PyList_GET_SIZE(words)) {
        PyObject *capitalized = fixup(
            (PyUnicodeObject *)PyList_GET_ITEM(words, idx), fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
        idx++;
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
5104
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005105/* Argument converter. Coerces to a single unicode character */
5106
5107static int
5108convert_uc(PyObject *obj, void *addr)
5109{
5110 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5111 PyObject *uniobj;
5112 Py_UNICODE *unistr;
5113
5114 uniobj = PyUnicode_FromObject(obj);
5115 if (uniobj == NULL) {
5116 PyErr_SetString(PyExc_TypeError,
5117 "The fill character cannot be converted to Unicode");
5118 return 0;
5119 }
5120 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5121 PyErr_SetString(PyExc_TypeError,
5122 "The fill character must be exactly one character long");
5123 Py_DECREF(uniobj);
5124 return 0;
5125 }
5126 unistr = PyUnicode_AS_UNICODE(uniobj);
5127 *fillcharloc = unistr[0];
5128 Py_DECREF(uniobj);
5129 return 1;
5130}
5131
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005132PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005133"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005134\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005135Return S centered in a Unicode string of length width. Padding is\n\
5136done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137
5138static PyObject *
5139unicode_center(PyUnicodeObject *self, PyObject *args)
5140{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005141 Py_ssize_t marg, left;
5142 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005143 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005144
Thomas Woutersde017742006-02-16 19:34:37 +00005145 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005146 return NULL;
5147
Tim Peters7a29bd52001-09-12 03:03:31 +00005148 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005149 Py_INCREF(self);
5150 return (PyObject*) self;
5151 }
5152
5153 marg = width - self->length;
5154 left = marg / 2 + (marg & width & 1);
5155
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005156 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005157}
5158
Marc-André Lemburge5034372000-08-08 08:04:29 +00005159#if 0
5160
5161/* This code should go into some future Unicode collation support
5162 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005163 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005164
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005165/* speedy UTF-16 code point order comparison */
5166/* gleaned from: */
5167/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5168
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005169static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005170{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005171 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005172 0, 0, 0, 0, 0, 0, 0, 0,
5173 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005174 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005175};
5176
Guido van Rossumd57fd912000-03-10 22:53:23 +00005177static int
5178unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5179{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005180 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005181
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182 Py_UNICODE *s1 = str1->str;
5183 Py_UNICODE *s2 = str2->str;
5184
5185 len1 = str1->length;
5186 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005187
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005189 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005190
5191 c1 = *s1++;
5192 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005193
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005194 if (c1 > (1<<11) * 26)
5195 c1 += utf16Fixup[c1>>11];
5196 if (c2 > (1<<11) * 26)
5197 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005198 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005199
5200 if (c1 != c2)
5201 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005202
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005203 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204 }
5205
5206 return (len1 < len2) ? -1 : (len1 != len2);
5207}
5208
Marc-André Lemburge5034372000-08-08 08:04:29 +00005209#else
5210
5211static int
5212unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5213{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005214 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005215
5216 Py_UNICODE *s1 = str1->str;
5217 Py_UNICODE *s2 = str2->str;
5218
5219 len1 = str1->length;
5220 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005221
Marc-André Lemburge5034372000-08-08 08:04:29 +00005222 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005223 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005224
Fredrik Lundh45714e92001-06-26 16:39:36 +00005225 c1 = *s1++;
5226 c2 = *s2++;
5227
5228 if (c1 != c2)
5229 return (c1 < c2) ? -1 : 1;
5230
Marc-André Lemburge5034372000-08-08 08:04:29 +00005231 len1--; len2--;
5232 }
5233
5234 return (len1 < len2) ? -1 : (len1 != len2);
5235}
5236
5237#endif
5238
Guido van Rossumd57fd912000-03-10 22:53:23 +00005239int PyUnicode_Compare(PyObject *left,
5240 PyObject *right)
5241{
5242 PyUnicodeObject *u = NULL, *v = NULL;
5243 int result;
5244
5245 /* Coerce the two arguments */
5246 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5247 if (u == NULL)
5248 goto onError;
5249 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5250 if (v == NULL)
5251 goto onError;
5252
Thomas Wouters7e474022000-07-16 12:04:32 +00005253 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254 if (v == u) {
5255 Py_DECREF(u);
5256 Py_DECREF(v);
5257 return 0;
5258 }
5259
5260 result = unicode_compare(u, v);
5261
5262 Py_DECREF(u);
5263 Py_DECREF(v);
5264 return result;
5265
5266onError:
5267 Py_XDECREF(u);
5268 Py_XDECREF(v);
5269 return -1;
5270}
5271
Guido van Rossum403d68b2000-03-13 15:55:09 +00005272int PyUnicode_Contains(PyObject *container,
5273 PyObject *element)
5274{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005275 PyObject *str, *sub;
5276 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005277
5278 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005279 sub = PyUnicode_FromObject(element);
5280 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005281 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005282 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00005283 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005284 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00005285
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005286 str = PyUnicode_FromObject(container);
5287 if (!str) {
5288 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00005289 return -1;
5290 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005291
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005292 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00005293
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005294 Py_DECREF(str);
5295 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00005296
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005297 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005298}
5299
Guido van Rossumd57fd912000-03-10 22:53:23 +00005300/* Concat to string or Unicode object giving a new Unicode object. */
5301
5302PyObject *PyUnicode_Concat(PyObject *left,
5303 PyObject *right)
5304{
5305 PyUnicodeObject *u = NULL, *v = NULL, *w;
5306
5307 /* Coerce the two arguments */
5308 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5309 if (u == NULL)
5310 goto onError;
5311 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5312 if (v == NULL)
5313 goto onError;
5314
5315 /* Shortcuts */
5316 if (v == unicode_empty) {
5317 Py_DECREF(v);
5318 return (PyObject *)u;
5319 }
5320 if (u == unicode_empty) {
5321 Py_DECREF(u);
5322 return (PyObject *)v;
5323 }
5324
5325 /* Concat the two Unicode strings */
5326 w = _PyUnicode_New(u->length + v->length);
5327 if (w == NULL)
5328 goto onError;
5329 Py_UNICODE_COPY(w->str, u->str, u->length);
5330 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5331
5332 Py_DECREF(u);
5333 Py_DECREF(v);
5334 return (PyObject *)w;
5335
5336onError:
5337 Py_XDECREF(u);
5338 Py_XDECREF(v);
5339 return NULL;
5340}
5341
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005342PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005343"S.count(sub[, start[, end]]) -> int\n\
5344\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00005345Return the number of non-overlapping occurrences of substring sub in\n\
5346Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005347interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348
5349static PyObject *
5350unicode_count(PyUnicodeObject *self, PyObject *args)
5351{
5352 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005353 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005354 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355 PyObject *result;
5356
Guido van Rossumb8872e62000-05-09 14:14:27 +00005357 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5358 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359 return NULL;
5360
5361 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005362 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363 if (substring == NULL)
5364 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005365
Fredrik Lundhc8162812006-05-26 19:33:03 +00005366 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005367
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005368 result = PyInt_FromSsize_t(
5369 stringlib_count(self->str + start, end - start,
5370 substring->str, substring->length)
5371 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372
5373 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005374
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375 return result;
5376}
5377
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005378PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005379"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005380\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005381Encodes S using the codec registered for encoding. encoding defaults\n\
5382to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005383handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005384a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5385'xmlcharrefreplace' as well as any other name registered with\n\
5386codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387
5388static PyObject *
5389unicode_encode(PyUnicodeObject *self, PyObject *args)
5390{
5391 char *encoding = NULL;
5392 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005393 PyObject *v;
5394
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5396 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005397 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005398 if (v == NULL)
5399 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005400 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5401 PyErr_Format(PyExc_TypeError,
5402 "encoder did not return a string/unicode object "
5403 "(type=%.400s)",
5404 v->ob_type->tp_name);
5405 Py_DECREF(v);
5406 return NULL;
5407 }
5408 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005409
5410 onError:
5411 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005412}
5413
5414PyDoc_STRVAR(decode__doc__,
5415"S.decode([encoding[,errors]]) -> string or unicode\n\
5416\n\
5417Decodes S using the codec registered for encoding. encoding defaults\n\
5418to the default encoding. errors may be given to set a different error\n\
5419handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5420a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5421as well as any other name registerd with codecs.register_error that is\n\
5422able to handle UnicodeDecodeErrors.");
5423
5424static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005425unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005426{
5427 char *encoding = NULL;
5428 char *errors = NULL;
5429 PyObject *v;
5430
5431 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5432 return NULL;
5433 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005434 if (v == NULL)
5435 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005436 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5437 PyErr_Format(PyExc_TypeError,
5438 "decoder did not return a string/unicode object "
5439 "(type=%.400s)",
5440 v->ob_type->tp_name);
5441 Py_DECREF(v);
5442 return NULL;
5443 }
5444 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005445
5446 onError:
5447 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448}
5449
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005450PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451"S.expandtabs([tabsize]) -> unicode\n\
5452\n\
5453Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005454If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455
5456static PyObject*
5457unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5458{
5459 Py_UNICODE *e;
5460 Py_UNICODE *p;
5461 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005462 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005463 PyUnicodeObject *u;
5464 int tabsize = 8;
5465
5466 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5467 return NULL;
5468
Thomas Wouters7e474022000-07-16 12:04:32 +00005469 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005470 i = j = 0;
5471 e = self->str + self->length;
5472 for (p = self->str; p < e; p++)
5473 if (*p == '\t') {
5474 if (tabsize > 0)
5475 j += tabsize - (j % tabsize);
5476 }
5477 else {
5478 j++;
5479 if (*p == '\n' || *p == '\r') {
5480 i += j;
5481 j = 0;
5482 }
5483 }
5484
5485 /* Second pass: create output string and fill it */
5486 u = _PyUnicode_New(i + j);
5487 if (!u)
5488 return NULL;
5489
5490 j = 0;
5491 q = u->str;
5492
5493 for (p = self->str; p < e; p++)
5494 if (*p == '\t') {
5495 if (tabsize > 0) {
5496 i = tabsize - (j % tabsize);
5497 j += i;
5498 while (i--)
5499 *q++ = ' ';
5500 }
5501 }
5502 else {
5503 j++;
5504 *q++ = *p;
5505 if (*p == '\n' || *p == '\r')
5506 j = 0;
5507 }
5508
5509 return (PyObject*) u;
5510}
5511
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005512PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005513"S.find(sub [,start [,end]]) -> int\n\
5514\n\
5515Return the lowest index in S where substring sub is found,\n\
5516such that sub is contained within s[start,end]. Optional\n\
5517arguments start and end are interpreted as in slice notation.\n\
5518\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005519Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520
5521static PyObject *
5522unicode_find(PyUnicodeObject *self, PyObject *args)
5523{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005524 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005525 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005526 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005527 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528
Guido van Rossumb8872e62000-05-09 14:14:27 +00005529 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5530 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005532 substring = PyUnicode_FromObject(substring);
5533 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005534 return NULL;
5535
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005536 result = stringlib_find_slice(
5537 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5538 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5539 start, end
5540 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541
5542 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005543
5544 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545}
5546
5547static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005548unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549{
5550 if (index < 0 || index >= self->length) {
5551 PyErr_SetString(PyExc_IndexError, "string index out of range");
5552 return NULL;
5553 }
5554
5555 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5556}
5557
5558static long
5559unicode_hash(PyUnicodeObject *self)
5560{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005561 /* Since Unicode objects compare equal to their ASCII string
5562 counterparts, they should use the individual character values
5563 as basis for their hash value. This is needed to assure that
5564 strings and Unicode objects behave in the same way as
5565 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566
Martin v. Löwis18e16552006-02-15 17:27:45 +00005567 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005568 register Py_UNICODE *p;
5569 register long x;
5570
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571 if (self->hash != -1)
5572 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005573 len = PyUnicode_GET_SIZE(self);
5574 p = PyUnicode_AS_UNICODE(self);
5575 x = *p << 7;
5576 while (--len >= 0)
5577 x = (1000003*x) ^ *p++;
5578 x ^= PyUnicode_GET_SIZE(self);
5579 if (x == -1)
5580 x = -2;
5581 self->hash = x;
5582 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583}
5584
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005585PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586"S.index(sub [,start [,end]]) -> int\n\
5587\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005588Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589
5590static PyObject *
5591unicode_index(PyUnicodeObject *self, PyObject *args)
5592{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005593 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005594 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005595 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005596 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597
Guido van Rossumb8872e62000-05-09 14:14:27 +00005598 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5599 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005601 substring = PyUnicode_FromObject(substring);
5602 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603 return NULL;
5604
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005605 result = stringlib_find_slice(
5606 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5607 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5608 start, end
5609 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610
5611 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005612
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613 if (result < 0) {
5614 PyErr_SetString(PyExc_ValueError, "substring not found");
5615 return NULL;
5616 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005617
Martin v. Löwis18e16552006-02-15 17:27:45 +00005618 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619}
5620
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005621PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005622"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005624Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005625at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626
5627static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005628unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629{
5630 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5631 register const Py_UNICODE *e;
5632 int cased;
5633
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634 /* Shortcut for single character strings */
5635 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005636 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005637
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005638 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005639 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005640 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005641
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642 e = p + PyUnicode_GET_SIZE(self);
5643 cased = 0;
5644 for (; p < e; p++) {
5645 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005646
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005648 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649 else if (!cased && Py_UNICODE_ISLOWER(ch))
5650 cased = 1;
5651 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005652 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005653}
5654
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005655PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005656"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005658Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005659at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660
5661static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005662unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663{
5664 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5665 register const Py_UNICODE *e;
5666 int cased;
5667
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668 /* Shortcut for single character strings */
5669 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005670 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005672 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005673 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005674 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005675
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676 e = p + PyUnicode_GET_SIZE(self);
5677 cased = 0;
5678 for (; p < e; p++) {
5679 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005680
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005682 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683 else if (!cased && Py_UNICODE_ISUPPER(ch))
5684 cased = 1;
5685 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005686 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687}
5688
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005689PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005690"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005692Return True if S is a titlecased string and there is at least one\n\
5693character in S, i.e. upper- and titlecase characters may only\n\
5694follow uncased characters and lowercase characters only cased ones.\n\
5695Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696
5697static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005698unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699{
5700 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5701 register const Py_UNICODE *e;
5702 int cased, previous_is_cased;
5703
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704 /* Shortcut for single character strings */
5705 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005706 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5707 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005709 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005710 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005711 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005712
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713 e = p + PyUnicode_GET_SIZE(self);
5714 cased = 0;
5715 previous_is_cased = 0;
5716 for (; p < e; p++) {
5717 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005718
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5720 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005721 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005722 previous_is_cased = 1;
5723 cased = 1;
5724 }
5725 else if (Py_UNICODE_ISLOWER(ch)) {
5726 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005727 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728 previous_is_cased = 1;
5729 cased = 1;
5730 }
5731 else
5732 previous_is_cased = 0;
5733 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005734 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735}
5736
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005737PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005738"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005740Return True if all characters in S are whitespace\n\
5741and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742
5743static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005744unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745{
5746 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5747 register const Py_UNICODE *e;
5748
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749 /* Shortcut for single character strings */
5750 if (PyUnicode_GET_SIZE(self) == 1 &&
5751 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005752 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005754 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005755 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005756 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005757
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758 e = p + PyUnicode_GET_SIZE(self);
5759 for (; p < e; p++) {
5760 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005761 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005763 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764}
5765
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005766PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005767"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005768\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005769Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005770and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005771
5772static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005773unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005774{
5775 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5776 register const Py_UNICODE *e;
5777
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005778 /* Shortcut for single character strings */
5779 if (PyUnicode_GET_SIZE(self) == 1 &&
5780 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005781 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005782
5783 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005784 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005785 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005786
5787 e = p + PyUnicode_GET_SIZE(self);
5788 for (; p < e; p++) {
5789 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005790 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005791 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005792 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005793}
5794
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005795PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005796"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005797\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005798Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005799and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005800
5801static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005802unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005803{
5804 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5805 register const Py_UNICODE *e;
5806
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005807 /* Shortcut for single character strings */
5808 if (PyUnicode_GET_SIZE(self) == 1 &&
5809 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005810 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005811
5812 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005813 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005814 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005815
5816 e = p + PyUnicode_GET_SIZE(self);
5817 for (; p < e; p++) {
5818 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005819 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005820 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005821 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005822}
5823
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005824PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005825"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005826\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005827Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005828False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829
5830static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005831unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832{
5833 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5834 register const Py_UNICODE *e;
5835
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836 /* Shortcut for single character strings */
5837 if (PyUnicode_GET_SIZE(self) == 1 &&
5838 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005839 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005841 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005842 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005843 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005844
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845 e = p + PyUnicode_GET_SIZE(self);
5846 for (; p < e; p++) {
5847 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005848 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005849 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005850 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851}
5852
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005853PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005854"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005856Return True if all characters in S are digits\n\
5857and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858
5859static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005860unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861{
5862 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5863 register const Py_UNICODE *e;
5864
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865 /* Shortcut for single character strings */
5866 if (PyUnicode_GET_SIZE(self) == 1 &&
5867 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005868 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005870 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005871 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005872 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005873
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874 e = p + PyUnicode_GET_SIZE(self);
5875 for (; p < e; p++) {
5876 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005877 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005879 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880}
5881
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005882PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005883"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005885Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005886False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887
5888static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005889unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890{
5891 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5892 register const Py_UNICODE *e;
5893
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894 /* Shortcut for single character strings */
5895 if (PyUnicode_GET_SIZE(self) == 1 &&
5896 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005897 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005899 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005900 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005901 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005902
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903 e = p + PyUnicode_GET_SIZE(self);
5904 for (; p < e; p++) {
5905 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005906 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005908 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909}
5910
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005911PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912"S.join(sequence) -> unicode\n\
5913\n\
5914Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005915sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916
5917static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005918unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005920 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921}
5922
Martin v. Löwis18e16552006-02-15 17:27:45 +00005923static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924unicode_length(PyUnicodeObject *self)
5925{
5926 return self->length;
5927}
5928
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005929PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005930"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931\n\
5932Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005933done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934
5935static PyObject *
5936unicode_ljust(PyUnicodeObject *self, PyObject *args)
5937{
Martin v. Löwis412fb672006-04-13 06:34:32 +00005938 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005939 Py_UNICODE fillchar = ' ';
5940
Martin v. Löwis412fb672006-04-13 06:34:32 +00005941 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942 return NULL;
5943
Tim Peters7a29bd52001-09-12 03:03:31 +00005944 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 Py_INCREF(self);
5946 return (PyObject*) self;
5947 }
5948
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005949 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950}
5951
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005952PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953"S.lower() -> unicode\n\
5954\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005955Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956
5957static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005958unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960 return fixup(self, fixlower);
5961}
5962
/* Strip modes: which end(s) of the string characters are removed from. */
#define LEFTSTRIP   0
#define RIGHTSTRIP  1
#define BOTHSTRIP   2

/* PyArg_ParseTuple format strings, indexed by the strip modes above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string minus its "|O:" prefix. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
5971
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005972/* externally visible for str.strip(unicode) */
5973PyObject *
5974_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5975{
5976 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005977 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005978 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005979 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
5980 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005981
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005982 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
5983
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005984 i = 0;
5985 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005986 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
5987 i++;
5988 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005989 }
5990
5991 j = len;
5992 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005993 do {
5994 j--;
5995 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
5996 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005997 }
5998
5999 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006000 Py_INCREF(self);
6001 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006002 }
6003 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006004 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006005}
6006
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007
6008static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006009do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006011 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006012 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006013
6014 i = 0;
6015 if (striptype != RIGHTSTRIP) {
6016 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6017 i++;
6018 }
6019 }
6020
6021 j = len;
6022 if (striptype != LEFTSTRIP) {
6023 do {
6024 j--;
6025 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6026 j++;
6027 }
6028
6029 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6030 Py_INCREF(self);
6031 return (PyObject*)self;
6032 }
6033 else
6034 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035}
6036
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006037
6038static PyObject *
6039do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6040{
6041 PyObject *sep = NULL;
6042
6043 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6044 return NULL;
6045
6046 if (sep != NULL && sep != Py_None) {
6047 if (PyUnicode_Check(sep))
6048 return _PyUnicode_XStrip(self, striptype, sep);
6049 else if (PyString_Check(sep)) {
6050 PyObject *res;
6051 sep = PyUnicode_FromObject(sep);
6052 if (sep==NULL)
6053 return NULL;
6054 res = _PyUnicode_XStrip(self, striptype, sep);
6055 Py_DECREF(sep);
6056 return res;
6057 }
6058 else {
6059 PyErr_Format(PyExc_TypeError,
6060 "%s arg must be None, unicode or str",
6061 STRIPNAME(striptype));
6062 return NULL;
6063 }
6064 }
6065
6066 return do_strip(self, striptype);
6067}
6068
6069
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006070PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006071"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006072\n\
6073Return a copy of the string S with leading and trailing\n\
6074whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006075If chars is given and not None, remove characters in chars instead.\n\
6076If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006077
6078static PyObject *
6079unicode_strip(PyUnicodeObject *self, PyObject *args)
6080{
6081 if (PyTuple_GET_SIZE(args) == 0)
6082 return do_strip(self, BOTHSTRIP); /* Common case */
6083 else
6084 return do_argstrip(self, BOTHSTRIP, args);
6085}
6086
6087
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006088PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006089"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006090\n\
6091Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006092If chars is given and not None, remove characters in chars instead.\n\
6093If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006094
6095static PyObject *
6096unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6097{
6098 if (PyTuple_GET_SIZE(args) == 0)
6099 return do_strip(self, LEFTSTRIP); /* Common case */
6100 else
6101 return do_argstrip(self, LEFTSTRIP, args);
6102}
6103
6104
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006105PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006106"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006107\n\
6108Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006109If chars is given and not None, remove characters in chars instead.\n\
6110If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006111
6112static PyObject *
6113unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6114{
6115 if (PyTuple_GET_SIZE(args) == 0)
6116 return do_strip(self, RIGHTSTRIP); /* Common case */
6117 else
6118 return do_argstrip(self, RIGHTSTRIP, args);
6119}
6120
6121
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006123unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124{
6125 PyUnicodeObject *u;
6126 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006127 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006128 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129
6130 if (len < 0)
6131 len = 0;
6132
Tim Peters7a29bd52001-09-12 03:03:31 +00006133 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134 /* no repeat, return original string */
6135 Py_INCREF(str);
6136 return (PyObject*) str;
6137 }
Tim Peters8f422462000-09-09 06:13:41 +00006138
6139 /* ensure # of chars needed doesn't overflow int and # of bytes
6140 * needed doesn't overflow size_t
6141 */
6142 nchars = len * str->length;
6143 if (len && nchars / len != str->length) {
6144 PyErr_SetString(PyExc_OverflowError,
6145 "repeated string is too long");
6146 return NULL;
6147 }
6148 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6149 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6150 PyErr_SetString(PyExc_OverflowError,
6151 "repeated string is too long");
6152 return NULL;
6153 }
6154 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155 if (!u)
6156 return NULL;
6157
6158 p = u->str;
6159
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006160 if (str->length == 1 && len > 0) {
6161 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006162 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00006163 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006164 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006165 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006166 done = str->length;
6167 }
6168 while (done < nchars) {
6169 int n = (done <= nchars-done) ? done : nchars-done;
6170 Py_UNICODE_COPY(p+done, p, n);
6171 done += n;
6172 }
6173 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174
6175 return (PyObject*) u;
6176}
6177
6178PyObject *PyUnicode_Replace(PyObject *obj,
6179 PyObject *subobj,
6180 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006181 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182{
6183 PyObject *self;
6184 PyObject *str1;
6185 PyObject *str2;
6186 PyObject *result;
6187
6188 self = PyUnicode_FromObject(obj);
6189 if (self == NULL)
6190 return NULL;
6191 str1 = PyUnicode_FromObject(subobj);
6192 if (str1 == NULL) {
6193 Py_DECREF(self);
6194 return NULL;
6195 }
6196 str2 = PyUnicode_FromObject(replobj);
6197 if (str2 == NULL) {
6198 Py_DECREF(self);
6199 Py_DECREF(str1);
6200 return NULL;
6201 }
Tim Petersced69f82003-09-16 20:30:58 +00006202 result = replace((PyUnicodeObject *)self,
6203 (PyUnicodeObject *)str1,
6204 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205 maxcount);
6206 Py_DECREF(self);
6207 Py_DECREF(str1);
6208 Py_DECREF(str2);
6209 return result;
6210}
6211
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006212PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213"S.replace (old, new[, maxsplit]) -> unicode\n\
6214\n\
6215Return a copy of S with all occurrences of substring\n\
6216old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006217given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218
6219static PyObject*
6220unicode_replace(PyUnicodeObject *self, PyObject *args)
6221{
6222 PyUnicodeObject *str1;
6223 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006224 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225 PyObject *result;
6226
Martin v. Löwis18e16552006-02-15 17:27:45 +00006227 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228 return NULL;
6229 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6230 if (str1 == NULL)
6231 return NULL;
6232 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006233 if (str2 == NULL) {
6234 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006236 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237
6238 result = replace(self, str1, str2, maxcount);
6239
6240 Py_DECREF(str1);
6241 Py_DECREF(str2);
6242 return result;
6243}
6244
6245static
6246PyObject *unicode_repr(PyObject *unicode)
6247{
6248 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6249 PyUnicode_GET_SIZE(unicode),
6250 1);
6251}
6252
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006253PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254"S.rfind(sub [,start [,end]]) -> int\n\
6255\n\
6256Return the highest index in S where substring sub is found,\n\
6257such that sub is contained within s[start,end]. Optional\n\
6258arguments start and end are interpreted as in slice notation.\n\
6259\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006260Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261
6262static PyObject *
6263unicode_rfind(PyUnicodeObject *self, PyObject *args)
6264{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006265 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006266 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006267 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006268 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269
Guido van Rossumb8872e62000-05-09 14:14:27 +00006270 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6271 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006273 substring = PyUnicode_FromObject(substring);
6274 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275 return NULL;
6276
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006277 result = stringlib_rfind_slice(
6278 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6279 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6280 start, end
6281 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282
6283 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006284
6285 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286}
6287
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006288PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289"S.rindex(sub [,start [,end]]) -> int\n\
6290\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006291Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292
6293static PyObject *
6294unicode_rindex(PyUnicodeObject *self, PyObject *args)
6295{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006296 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006297 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006298 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006299 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300
Guido van Rossumb8872e62000-05-09 14:14:27 +00006301 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6302 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006304 substring = PyUnicode_FromObject(substring);
6305 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306 return NULL;
6307
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006308 result = stringlib_rfind_slice(
6309 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6310 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6311 start, end
6312 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313
6314 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006315
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316 if (result < 0) {
6317 PyErr_SetString(PyExc_ValueError, "substring not found");
6318 return NULL;
6319 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006320 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321}
6322
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006323PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006324"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006325\n\
6326Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006327done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328
6329static PyObject *
6330unicode_rjust(PyUnicodeObject *self, PyObject *args)
6331{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006332 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006333 Py_UNICODE fillchar = ' ';
6334
Martin v. Löwis412fb672006-04-13 06:34:32 +00006335 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336 return NULL;
6337
Tim Peters7a29bd52001-09-12 03:03:31 +00006338 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339 Py_INCREF(self);
6340 return (PyObject*) self;
6341 }
6342
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006343 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006344}
6345
Guido van Rossumd57fd912000-03-10 22:53:23 +00006346static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006347unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348{
6349 /* standard clamping */
6350 if (start < 0)
6351 start = 0;
6352 if (end < 0)
6353 end = 0;
6354 if (end > self->length)
6355 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006356 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357 /* full slice, return original string */
6358 Py_INCREF(self);
6359 return (PyObject*) self;
6360 }
6361 if (start > end)
6362 start = end;
6363 /* copy slice */
6364 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6365 end - start);
6366}
6367
6368PyObject *PyUnicode_Split(PyObject *s,
6369 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006370 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371{
6372 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006373
Guido van Rossumd57fd912000-03-10 22:53:23 +00006374 s = PyUnicode_FromObject(s);
6375 if (s == NULL)
6376 return NULL;
6377 if (sep != NULL) {
6378 sep = PyUnicode_FromObject(sep);
6379 if (sep == NULL) {
6380 Py_DECREF(s);
6381 return NULL;
6382 }
6383 }
6384
6385 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6386
6387 Py_DECREF(s);
6388 Py_XDECREF(sep);
6389 return result;
6390}
6391
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006392PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393"S.split([sep [,maxsplit]]) -> list of strings\n\
6394\n\
6395Return a list of the words in S, using sep as the\n\
6396delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006397splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006398any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399
6400static PyObject*
6401unicode_split(PyUnicodeObject *self, PyObject *args)
6402{
6403 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006404 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405
Martin v. Löwis18e16552006-02-15 17:27:45 +00006406 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407 return NULL;
6408
6409 if (substring == Py_None)
6410 return split(self, NULL, maxcount);
6411 else if (PyUnicode_Check(substring))
6412 return split(self, (PyUnicodeObject *)substring, maxcount);
6413 else
6414 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6415}
6416
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006417PyObject *
6418PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6419{
6420 PyObject* str_obj;
6421 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006422 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00006423
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006424 str_obj = PyUnicode_FromObject(str_in);
6425 if (!str_obj)
6426 return NULL;
6427 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00006428 if (!sep_obj) {
6429 Py_DECREF(str_obj);
6430 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006431 }
6432
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006433 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00006434 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6435 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6436 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006437
Fredrik Lundhb9479482006-05-26 17:22:38 +00006438 Py_DECREF(sep_obj);
6439 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006440
6441 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006442}
6443
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006444
6445PyObject *
6446PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6447{
6448 PyObject* str_obj;
6449 PyObject* sep_obj;
6450 PyObject* out;
6451
6452 str_obj = PyUnicode_FromObject(str_in);
6453 if (!str_obj)
6454 return NULL;
6455 sep_obj = PyUnicode_FromObject(sep_in);
6456 if (!sep_obj) {
6457 Py_DECREF(str_obj);
6458 return NULL;
6459 }
6460
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006461 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006462 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6463 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6464 );
6465
6466 Py_DECREF(sep_obj);
6467 Py_DECREF(str_obj);
6468
6469 return out;
6470}
6471
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006472PyDoc_STRVAR(partition__doc__,
6473"S.partition(sep) -> (head, sep, tail)\n\
6474\n\
6475Searches for the separator sep in S, and returns the part before it,\n\
6476the separator itself, and the part after it. If the separator is not\n\
6477found, returns S and two empty strings.");
6478
6479static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00006480unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006481{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006482 return PyUnicode_Partition((PyObject *)self, separator);
6483}
6484
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006485PyDoc_STRVAR(rpartition__doc__,
6486"S.rpartition(sep) -> (head, sep, tail)\n\
6487\n\
6488Searches for the separator sep in S, starting at the end of S, and returns\n\
6489the part before it, the separator itself, and the part after it. If the\n\
6490separator is not found, returns S and two empty strings.");
6491
6492static PyObject*
6493unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6494{
6495 return PyUnicode_RPartition((PyObject *)self, separator);
6496}
6497
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006498PyObject *PyUnicode_RSplit(PyObject *s,
6499 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006500 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006501{
6502 PyObject *result;
6503
6504 s = PyUnicode_FromObject(s);
6505 if (s == NULL)
6506 return NULL;
6507 if (sep != NULL) {
6508 sep = PyUnicode_FromObject(sep);
6509 if (sep == NULL) {
6510 Py_DECREF(s);
6511 return NULL;
6512 }
6513 }
6514
6515 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6516
6517 Py_DECREF(s);
6518 Py_XDECREF(sep);
6519 return result;
6520}
6521
6522PyDoc_STRVAR(rsplit__doc__,
6523"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6524\n\
6525Return a list of the words in S, using sep as the\n\
6526delimiter string, starting at the end of the string and\n\
6527working to the front. If maxsplit is given, at most maxsplit\n\
6528splits are done. If sep is not specified, any whitespace string\n\
6529is a separator.");
6530
6531static PyObject*
6532unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6533{
6534 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006535 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006536
Martin v. Löwis18e16552006-02-15 17:27:45 +00006537 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006538 return NULL;
6539
6540 if (substring == Py_None)
6541 return rsplit(self, NULL, maxcount);
6542 else if (PyUnicode_Check(substring))
6543 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6544 else
6545 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6546}
6547
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006548PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006549"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550\n\
6551Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006552Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006553is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554
6555static PyObject*
6556unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6557{
Guido van Rossum86662912000-04-11 15:38:46 +00006558 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559
Guido van Rossum86662912000-04-11 15:38:46 +00006560 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561 return NULL;
6562
Guido van Rossum86662912000-04-11 15:38:46 +00006563 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564}
6565
6566static
6567PyObject *unicode_str(PyUnicodeObject *self)
6568{
Fred Drakee4315f52000-05-09 19:53:39 +00006569 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570}
6571
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006572PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573"S.swapcase() -> unicode\n\
6574\n\
6575Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006576and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577
6578static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006579unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581 return fixup(self, fixswapcase);
6582}
6583
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006584PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585"S.translate(table) -> unicode\n\
6586\n\
6587Return a copy of the string S, where all characters have been mapped\n\
6588through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006589Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6590Unmapped characters are left untouched. Characters mapped to None\n\
6591are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592
6593static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006594unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595{
Tim Petersced69f82003-09-16 20:30:58 +00006596 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006598 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599 "ignore");
6600}
6601
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006602PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603"S.upper() -> unicode\n\
6604\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006605Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606
6607static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006608unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610 return fixup(self, fixupper);
6611}
6612
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006613PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614"S.zfill(width) -> unicode\n\
6615\n\
6616Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006617of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618
6619static PyObject *
6620unicode_zfill(PyUnicodeObject *self, PyObject *args)
6621{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006622 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623 PyUnicodeObject *u;
6624
Martin v. Löwis18e16552006-02-15 17:27:45 +00006625 Py_ssize_t width;
6626 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627 return NULL;
6628
6629 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006630 if (PyUnicode_CheckExact(self)) {
6631 Py_INCREF(self);
6632 return (PyObject*) self;
6633 }
6634 else
6635 return PyUnicode_FromUnicode(
6636 PyUnicode_AS_UNICODE(self),
6637 PyUnicode_GET_SIZE(self)
6638 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639 }
6640
6641 fill = width - self->length;
6642
6643 u = pad(self, fill, 0, '0');
6644
Walter Dörwald068325e2002-04-15 13:36:47 +00006645 if (u == NULL)
6646 return NULL;
6647
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648 if (u->str[fill] == '+' || u->str[fill] == '-') {
6649 /* move sign to beginning of string */
6650 u->str[0] = u->str[fill];
6651 u->str[fill] = '0';
6652 }
6653
6654 return (PyObject*) u;
6655}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656
#if 0
/* Disabled helper (compiled out by the surrounding #if 0): returns the
   value of unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
6664
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006665PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006666"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006668Return True if S starts with the specified prefix, False otherwise.\n\
6669With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00006670With optional end, stop comparing S at that position.\n\
6671prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672
6673static PyObject *
6674unicode_startswith(PyUnicodeObject *self,
6675 PyObject *args)
6676{
Georg Brandl24250812006-06-09 18:45:48 +00006677 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006679 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006680 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00006681 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682
Georg Brandl24250812006-06-09 18:45:48 +00006683 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00006684 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00006686 if (PyTuple_Check(subobj)) {
6687 Py_ssize_t i;
6688 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6689 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6690 PyTuple_GET_ITEM(subobj, i));
6691 if (substring == NULL)
6692 return NULL;
6693 result = tailmatch(self, substring, start, end, -1);
6694 Py_DECREF(substring);
6695 if (result) {
6696 Py_RETURN_TRUE;
6697 }
6698 }
6699 /* nothing matched */
6700 Py_RETURN_FALSE;
6701 }
6702 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00006704 return NULL;
6705 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00006707 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708}
6709
6710
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006711PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006712"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006714Return True if S ends with the specified suffix, False otherwise.\n\
6715With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00006716With optional end, stop comparing S at that position.\n\
6717suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718
6719static PyObject *
6720unicode_endswith(PyUnicodeObject *self,
6721 PyObject *args)
6722{
Georg Brandl24250812006-06-09 18:45:48 +00006723 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006725 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006726 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00006727 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728
Georg Brandl24250812006-06-09 18:45:48 +00006729 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
6730 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00006732 if (PyTuple_Check(subobj)) {
6733 Py_ssize_t i;
6734 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6735 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6736 PyTuple_GET_ITEM(subobj, i));
6737 if (substring == NULL)
6738 return NULL;
6739 result = tailmatch(self, substring, start, end, +1);
6740 Py_DECREF(substring);
6741 if (result) {
6742 Py_RETURN_TRUE;
6743 }
6744 }
6745 Py_RETURN_FALSE;
6746 }
6747 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00006749 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750
Georg Brandl24250812006-06-09 18:45:48 +00006751 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00006753 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754}
6755
6756
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006757
6758static PyObject *
6759unicode_getnewargs(PyUnicodeObject *v)
6760{
6761 return Py_BuildValue("(u#)", v->str, v->length);
6762}
6763
6764
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765static PyMethodDef unicode_methods[] = {
6766
6767 /* Order is according to common usage: often used methods should
6768 appear first, since lookup is done sequentially. */
6769
Georg Brandlecdc0a92006-03-30 12:19:07 +00006770 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006771 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6772 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006773 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006774 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6775 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6776 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6777 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6778 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6779 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6780 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00006781 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006782 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6783 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6784 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006785 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006786 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006787/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6788 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6789 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6790 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006791 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006792 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006793 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006794 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006795 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6796 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6797 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6798 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6799 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6800 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6801 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6802 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6803 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6804 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6805 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6806 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6807 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6808 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006809 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006810#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006811 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812#endif
6813
6814#if 0
6815 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006816 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817#endif
6818
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006819 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820 {NULL, NULL}
6821};
6822
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006823static PyObject *
6824unicode_mod(PyObject *v, PyObject *w)
6825{
6826 if (!PyUnicode_Check(v)) {
6827 Py_INCREF(Py_NotImplemented);
6828 return Py_NotImplemented;
6829 }
6830 return PyUnicode_Format(v, w);
6831}
6832
6833static PyNumberMethods unicode_as_number = {
6834 0, /*nb_add*/
6835 0, /*nb_subtract*/
6836 0, /*nb_multiply*/
6837 0, /*nb_divide*/
6838 unicode_mod, /*nb_remainder*/
6839};
6840
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006842 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00006843 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006844 (ssizeargfunc) unicode_repeat, /* sq_repeat */
6845 (ssizeargfunc) unicode_getitem, /* sq_item */
6846 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847 0, /* sq_ass_item */
6848 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00006849 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850};
6851
Guido van Rossum38fff8c2006-03-07 18:50:55 +00006852#define HASINDEX(o) PyType_HasFeature((o)->ob_type, Py_TPFLAGS_HAVE_INDEX)
6853
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006854static PyObject*
6855unicode_subscript(PyUnicodeObject* self, PyObject* item)
6856{
Guido van Rossum38fff8c2006-03-07 18:50:55 +00006857 PyNumberMethods *nb = item->ob_type->tp_as_number;
6858 if (nb != NULL && HASINDEX(item) && nb->nb_index != NULL) {
6859 Py_ssize_t i = nb->nb_index(item);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006860 if (i == -1 && PyErr_Occurred())
6861 return NULL;
6862 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006863 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006864 return unicode_getitem(self, i);
6865 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006866 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006867 Py_UNICODE* source_buf;
6868 Py_UNICODE* result_buf;
6869 PyObject* result;
6870
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006871 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006872 &start, &stop, &step, &slicelength) < 0) {
6873 return NULL;
6874 }
6875
6876 if (slicelength <= 0) {
6877 return PyUnicode_FromUnicode(NULL, 0);
6878 } else {
6879 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Anthony Baxtera6286212006-04-11 07:42:36 +00006880 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
6881 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006882
6883 if (result_buf == NULL)
6884 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006885
6886 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6887 result_buf[i] = source_buf[cur];
6888 }
Tim Petersced69f82003-09-16 20:30:58 +00006889
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006890 result = PyUnicode_FromUnicode(result_buf, slicelength);
6891 PyMem_FREE(result_buf);
6892 return result;
6893 }
6894 } else {
6895 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6896 return NULL;
6897 }
6898}
6899
6900static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006901 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006902 (binaryfunc)unicode_subscript, /* mp_subscript */
6903 (objobjargproc)0, /* mp_ass_subscript */
6904};
6905
Martin v. Löwis18e16552006-02-15 17:27:45 +00006906static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006908 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909 const void **ptr)
6910{
6911 if (index != 0) {
6912 PyErr_SetString(PyExc_SystemError,
6913 "accessing non-existent unicode segment");
6914 return -1;
6915 }
6916 *ptr = (void *) self->str;
6917 return PyUnicode_GET_DATA_SIZE(self);
6918}
6919
Martin v. Löwis18e16552006-02-15 17:27:45 +00006920static Py_ssize_t
6921unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922 const void **ptr)
6923{
6924 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006925 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926 return -1;
6927}
6928
6929static int
6930unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006931 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006932{
6933 if (lenp)
6934 *lenp = PyUnicode_GET_DATA_SIZE(self);
6935 return 1;
6936}
6937
Martin v. Löwiseb079f12006-02-16 14:32:27 +00006938static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006940 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941 const void **ptr)
6942{
6943 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006944
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945 if (index != 0) {
6946 PyErr_SetString(PyExc_SystemError,
6947 "accessing non-existent unicode segment");
6948 return -1;
6949 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006950 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951 if (str == NULL)
6952 return -1;
6953 *ptr = (void *) PyString_AS_STRING(str);
6954 return PyString_GET_SIZE(str);
6955}
6956
6957/* Helpers for PyUnicode_Format() */
6958
6959static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006960getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006962 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963 if (argidx < arglen) {
6964 (*p_argidx)++;
6965 if (arglen < 0)
6966 return args;
6967 else
6968 return PyTuple_GetItem(args, argidx);
6969 }
6970 PyErr_SetString(PyExc_TypeError,
6971 "not enough arguments for format string");
6972 return NULL;
6973}
6974
/* Bit flags recording which flag characters appeared in a format
   specifier (set while parsing in PyUnicode_Format()). */
#define F_LJUST (1 << 0)    /* '-' */
#define F_SIGN  (1 << 1)    /* '+' */
#define F_BLANK (1 << 2)    /* ' ' */
#define F_ALT   (1 << 3)    /* '#' */
#define F_ZERO  (1 << 4)    /* '0' */
6980
Martin v. Löwis18e16552006-02-15 17:27:45 +00006981static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00006982strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006984 register Py_ssize_t i;
6985 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986 for (i = len - 1; i >= 0; i--)
6987 buffer[i] = (Py_UNICODE) charbuffer[i];
6988
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989 return len;
6990}
6991
Neal Norwitzfc76d632006-01-10 06:03:13 +00006992static int
6993doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
6994{
Tim Peters15231542006-02-16 01:08:01 +00006995 Py_ssize_t result;
6996
Neal Norwitzfc76d632006-01-10 06:03:13 +00006997 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006998 result = strtounicode(buffer, (char *)buffer);
6999 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007000}
7001
7002static int
7003longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7004{
Tim Peters15231542006-02-16 01:08:01 +00007005 Py_ssize_t result;
7006
Neal Norwitzfc76d632006-01-10 06:03:13 +00007007 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007008 result = strtounicode(buffer, (char *)buffer);
7009 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007010}
7011
Guido van Rossum078151d2002-08-11 04:24:12 +00007012/* XXX To save some code duplication, formatfloat/long/int could have been
7013 shared with stringobject.c, converting from 8-bit to Unicode after the
7014 formatting is done. */
7015
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016static int
7017formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007018 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019 int flags,
7020 int prec,
7021 int type,
7022 PyObject *v)
7023{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007024 /* fmt = '%#.' + `prec` + `type`
7025 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026 char fmt[20];
7027 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007028
Guido van Rossumd57fd912000-03-10 22:53:23 +00007029 x = PyFloat_AsDouble(v);
7030 if (x == -1.0 && PyErr_Occurred())
7031 return -1;
7032 if (prec < 0)
7033 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7035 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007036 /* Worst case length calc to ensure no buffer overrun:
7037
7038 'g' formats:
7039 fmt = %#.<prec>g
7040 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7041 for any double rep.)
7042 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7043
7044 'f' formats:
7045 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7046 len = 1 + 50 + 1 + prec = 52 + prec
7047
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007048 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007049 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007050
7051 */
7052 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7053 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007054 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007055 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007056 return -1;
7057 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007058 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7059 (flags&F_ALT) ? "#" : "",
7060 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007061 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062}
7063
Tim Peters38fd5b62000-09-21 05:43:11 +00007064static PyObject*
7065formatlong(PyObject *val, int flags, int prec, int type)
7066{
7067 char *buf;
7068 int i, len;
7069 PyObject *str; /* temporary string object. */
7070 PyUnicodeObject *result;
7071
7072 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7073 if (!str)
7074 return NULL;
7075 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007076 if (!result) {
7077 Py_DECREF(str);
7078 return NULL;
7079 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007080 for (i = 0; i < len; i++)
7081 result->str[i] = buf[i];
7082 result->str[len] = 0;
7083 Py_DECREF(str);
7084 return (PyObject*)result;
7085}
7086
Guido van Rossumd57fd912000-03-10 22:53:23 +00007087static int
7088formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007089 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007090 int flags,
7091 int prec,
7092 int type,
7093 PyObject *v)
7094{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007095 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007096 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7097 * + 1 + 1
7098 * = 24
7099 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007100 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007101 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102 long x;
7103
7104 x = PyInt_AsLong(v);
7105 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007106 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007107 if (x < 0 && type == 'u') {
7108 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007109 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007110 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7111 sign = "-";
7112 else
7113 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007114 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007115 prec = 1;
7116
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007117 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7118 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007119 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007120 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007121 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007122 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007123 return -1;
7124 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007125
7126 if ((flags & F_ALT) &&
7127 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007128 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007129 * of issues that cause pain:
7130 * - when 0 is being converted, the C standard leaves off
7131 * the '0x' or '0X', which is inconsistent with other
7132 * %#x/%#X conversions and inconsistent with Python's
7133 * hex() function
7134 * - there are platforms that violate the standard and
7135 * convert 0 with the '0x' or '0X'
7136 * (Metrowerks, Compaq Tru64)
7137 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007138 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007139 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007140 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007141 * We can achieve the desired consistency by inserting our
7142 * own '0x' or '0X' prefix, and substituting %x/%X in place
7143 * of %#x/%#X.
7144 *
7145 * Note that this is the same approach as used in
7146 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007147 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007148 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7149 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007150 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007151 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007152 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7153 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007154 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007155 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007156 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007157 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007158 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007159 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160}
7161
7162static int
7163formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007164 size_t buflen,
7165 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007166{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007167 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007168 if (PyUnicode_Check(v)) {
7169 if (PyUnicode_GET_SIZE(v) != 1)
7170 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007172 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007173
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007174 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007175 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007176 goto onError;
7177 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7178 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179
7180 else {
7181 /* Integer input truncated to a character */
7182 long x;
7183 x = PyInt_AsLong(v);
7184 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007185 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007186#ifdef Py_UNICODE_WIDE
7187 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007188 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007189 "%c arg not in range(0x110000) "
7190 "(wide Python build)");
7191 return -1;
7192 }
7193#else
7194 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007195 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007196 "%c arg not in range(0x10000) "
7197 "(narrow Python build)");
7198 return -1;
7199 }
7200#endif
7201 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007202 }
7203 buf[1] = '\0';
7204 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007205
7206 onError:
7207 PyErr_SetString(PyExc_TypeError,
7208 "%c requires int or char");
7209 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210}
7211
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the cast stays attached
   to the constant wherever the macro is used (e.g. as the operand of
   sizeof or next to a postfix operator).
*/
#define FORMATBUFLEN ((size_t)120)
7221
Guido van Rossumd57fd912000-03-10 22:53:23 +00007222PyObject *PyUnicode_Format(PyObject *format,
7223 PyObject *args)
7224{
7225 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007226 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227 int args_owned = 0;
7228 PyUnicodeObject *result = NULL;
7229 PyObject *dict = NULL;
7230 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007231
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232 if (format == NULL || args == NULL) {
7233 PyErr_BadInternalCall();
7234 return NULL;
7235 }
7236 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007237 if (uformat == NULL)
7238 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007239 fmt = PyUnicode_AS_UNICODE(uformat);
7240 fmtcnt = PyUnicode_GET_SIZE(uformat);
7241
7242 reslen = rescnt = fmtcnt + 100;
7243 result = _PyUnicode_New(reslen);
7244 if (result == NULL)
7245 goto onError;
7246 res = PyUnicode_AS_UNICODE(result);
7247
7248 if (PyTuple_Check(args)) {
7249 arglen = PyTuple_Size(args);
7250 argidx = 0;
7251 }
7252 else {
7253 arglen = -1;
7254 argidx = -2;
7255 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007256 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7257 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007258 dict = args;
7259
7260 while (--fmtcnt >= 0) {
7261 if (*fmt != '%') {
7262 if (--rescnt < 0) {
7263 rescnt = fmtcnt + 100;
7264 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007265 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007266 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007267 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7268 --rescnt;
7269 }
7270 *res++ = *fmt++;
7271 }
7272 else {
7273 /* Got a format specifier */
7274 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007275 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007276 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277 Py_UNICODE c = '\0';
7278 Py_UNICODE fill;
7279 PyObject *v = NULL;
7280 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007281 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007283 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007284 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007285
7286 fmt++;
7287 if (*fmt == '(') {
7288 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007289 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007290 PyObject *key;
7291 int pcount = 1;
7292
7293 if (dict == NULL) {
7294 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007295 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007296 goto onError;
7297 }
7298 ++fmt;
7299 --fmtcnt;
7300 keystart = fmt;
7301 /* Skip over balanced parentheses */
7302 while (pcount > 0 && --fmtcnt >= 0) {
7303 if (*fmt == ')')
7304 --pcount;
7305 else if (*fmt == '(')
7306 ++pcount;
7307 fmt++;
7308 }
7309 keylen = fmt - keystart - 1;
7310 if (fmtcnt < 0 || pcount > 0) {
7311 PyErr_SetString(PyExc_ValueError,
7312 "incomplete format key");
7313 goto onError;
7314 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007315#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007316 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317 then looked up since Python uses strings to hold
7318 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007319 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007320 key = PyUnicode_EncodeUTF8(keystart,
7321 keylen,
7322 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007323#else
7324 key = PyUnicode_FromUnicode(keystart, keylen);
7325#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007326 if (key == NULL)
7327 goto onError;
7328 if (args_owned) {
7329 Py_DECREF(args);
7330 args_owned = 0;
7331 }
7332 args = PyObject_GetItem(dict, key);
7333 Py_DECREF(key);
7334 if (args == NULL) {
7335 goto onError;
7336 }
7337 args_owned = 1;
7338 arglen = -1;
7339 argidx = -2;
7340 }
7341 while (--fmtcnt >= 0) {
7342 switch (c = *fmt++) {
7343 case '-': flags |= F_LJUST; continue;
7344 case '+': flags |= F_SIGN; continue;
7345 case ' ': flags |= F_BLANK; continue;
7346 case '#': flags |= F_ALT; continue;
7347 case '0': flags |= F_ZERO; continue;
7348 }
7349 break;
7350 }
7351 if (c == '*') {
7352 v = getnextarg(args, arglen, &argidx);
7353 if (v == NULL)
7354 goto onError;
7355 if (!PyInt_Check(v)) {
7356 PyErr_SetString(PyExc_TypeError,
7357 "* wants int");
7358 goto onError;
7359 }
7360 width = PyInt_AsLong(v);
7361 if (width < 0) {
7362 flags |= F_LJUST;
7363 width = -width;
7364 }
7365 if (--fmtcnt >= 0)
7366 c = *fmt++;
7367 }
7368 else if (c >= '0' && c <= '9') {
7369 width = c - '0';
7370 while (--fmtcnt >= 0) {
7371 c = *fmt++;
7372 if (c < '0' || c > '9')
7373 break;
7374 if ((width*10) / 10 != width) {
7375 PyErr_SetString(PyExc_ValueError,
7376 "width too big");
7377 goto onError;
7378 }
7379 width = width*10 + (c - '0');
7380 }
7381 }
7382 if (c == '.') {
7383 prec = 0;
7384 if (--fmtcnt >= 0)
7385 c = *fmt++;
7386 if (c == '*') {
7387 v = getnextarg(args, arglen, &argidx);
7388 if (v == NULL)
7389 goto onError;
7390 if (!PyInt_Check(v)) {
7391 PyErr_SetString(PyExc_TypeError,
7392 "* wants int");
7393 goto onError;
7394 }
7395 prec = PyInt_AsLong(v);
7396 if (prec < 0)
7397 prec = 0;
7398 if (--fmtcnt >= 0)
7399 c = *fmt++;
7400 }
7401 else if (c >= '0' && c <= '9') {
7402 prec = c - '0';
7403 while (--fmtcnt >= 0) {
7404 c = Py_CHARMASK(*fmt++);
7405 if (c < '0' || c > '9')
7406 break;
7407 if ((prec*10) / 10 != prec) {
7408 PyErr_SetString(PyExc_ValueError,
7409 "prec too big");
7410 goto onError;
7411 }
7412 prec = prec*10 + (c - '0');
7413 }
7414 }
7415 } /* prec */
7416 if (fmtcnt >= 0) {
7417 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418 if (--fmtcnt >= 0)
7419 c = *fmt++;
7420 }
7421 }
7422 if (fmtcnt < 0) {
7423 PyErr_SetString(PyExc_ValueError,
7424 "incomplete format");
7425 goto onError;
7426 }
7427 if (c != '%') {
7428 v = getnextarg(args, arglen, &argidx);
7429 if (v == NULL)
7430 goto onError;
7431 }
7432 sign = 0;
7433 fill = ' ';
7434 switch (c) {
7435
7436 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007437 pbuf = formatbuf;
7438 /* presume that buffer length is at least 1 */
7439 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440 len = 1;
7441 break;
7442
7443 case 's':
7444 case 'r':
7445 if (PyUnicode_Check(v) && c == 's') {
7446 temp = v;
7447 Py_INCREF(temp);
7448 }
7449 else {
7450 PyObject *unicode;
7451 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007452 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453 else
7454 temp = PyObject_Repr(v);
7455 if (temp == NULL)
7456 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007457 if (PyUnicode_Check(temp))
7458 /* nothing to do */;
7459 else if (PyString_Check(temp)) {
7460 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007461 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007463 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007464 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007465 Py_DECREF(temp);
7466 temp = unicode;
7467 if (temp == NULL)
7468 goto onError;
7469 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007470 else {
7471 Py_DECREF(temp);
7472 PyErr_SetString(PyExc_TypeError,
7473 "%s argument has non-string str()");
7474 goto onError;
7475 }
7476 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007477 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007478 len = PyUnicode_GET_SIZE(temp);
7479 if (prec >= 0 && len > prec)
7480 len = prec;
7481 break;
7482
7483 case 'i':
7484 case 'd':
7485 case 'u':
7486 case 'o':
7487 case 'x':
7488 case 'X':
7489 if (c == 'i')
7490 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007491 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007492 temp = formatlong(v, flags, prec, c);
7493 if (!temp)
7494 goto onError;
7495 pbuf = PyUnicode_AS_UNICODE(temp);
7496 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007497 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007499 else {
7500 pbuf = formatbuf;
7501 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7502 flags, prec, c, v);
7503 if (len < 0)
7504 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007505 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007506 }
7507 if (flags & F_ZERO)
7508 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007509 break;
7510
7511 case 'e':
7512 case 'E':
7513 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007514 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515 case 'g':
7516 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007517 if (c == 'F')
7518 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007519 pbuf = formatbuf;
7520 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7521 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007522 if (len < 0)
7523 goto onError;
7524 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007525 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007526 fill = '0';
7527 break;
7528
7529 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007530 pbuf = formatbuf;
7531 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532 if (len < 0)
7533 goto onError;
7534 break;
7535
7536 default:
7537 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007538 "unsupported format character '%c' (0x%x) "
7539 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00007540 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007541 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00007542 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007543 goto onError;
7544 }
7545 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007546 if (*pbuf == '-' || *pbuf == '+') {
7547 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007548 len--;
7549 }
7550 else if (flags & F_SIGN)
7551 sign = '+';
7552 else if (flags & F_BLANK)
7553 sign = ' ';
7554 else
7555 sign = 0;
7556 }
7557 if (width < len)
7558 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007559 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007560 reslen -= rescnt;
7561 rescnt = width + fmtcnt + 100;
7562 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007563 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007564 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007565 PyErr_NoMemory();
7566 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007567 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007568 if (_PyUnicode_Resize(&result, reslen) < 0) {
7569 Py_XDECREF(temp);
7570 goto onError;
7571 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572 res = PyUnicode_AS_UNICODE(result)
7573 + reslen - rescnt;
7574 }
7575 if (sign) {
7576 if (fill != ' ')
7577 *res++ = sign;
7578 rescnt--;
7579 if (width > len)
7580 width--;
7581 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007582 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7583 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007584 assert(pbuf[1] == c);
7585 if (fill != ' ') {
7586 *res++ = *pbuf++;
7587 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007588 }
Tim Petersfff53252001-04-12 18:38:48 +00007589 rescnt -= 2;
7590 width -= 2;
7591 if (width < 0)
7592 width = 0;
7593 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007594 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595 if (width > len && !(flags & F_LJUST)) {
7596 do {
7597 --rescnt;
7598 *res++ = fill;
7599 } while (--width > len);
7600 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007601 if (fill == ' ') {
7602 if (sign)
7603 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007604 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007605 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007606 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007607 *res++ = *pbuf++;
7608 *res++ = *pbuf++;
7609 }
7610 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007611 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612 res += len;
7613 rescnt -= len;
7614 while (--width >= len) {
7615 --rescnt;
7616 *res++ = ' ';
7617 }
7618 if (dict && (argidx < arglen) && c != '%') {
7619 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007620 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007621 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007622 goto onError;
7623 }
7624 Py_XDECREF(temp);
7625 } /* '%' */
7626 } /* until end */
7627 if (argidx < arglen && !dict) {
7628 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007629 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007630 goto onError;
7631 }
7632
Thomas Woutersa96affe2006-03-12 00:29:36 +00007633 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7634 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007635 if (args_owned) {
7636 Py_DECREF(args);
7637 }
7638 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007639 return (PyObject *)result;
7640
7641 onError:
7642 Py_XDECREF(result);
7643 Py_DECREF(uformat);
7644 if (args_owned) {
7645 Py_DECREF(args);
7646 }
7647 return NULL;
7648}
7649
7650static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007651 (readbufferproc) unicode_buffer_getreadbuf,
7652 (writebufferproc) unicode_buffer_getwritebuf,
7653 (segcountproc) unicode_buffer_getsegcount,
7654 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655};
7656
Jeremy Hylton938ace62002-07-17 16:30:39 +00007657static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007658unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7659
Tim Peters6d6c1a32001-08-02 04:15:00 +00007660static PyObject *
7661unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7662{
7663 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007664 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007665 char *encoding = NULL;
7666 char *errors = NULL;
7667
Guido van Rossume023fe02001-08-30 03:12:59 +00007668 if (type != &PyUnicode_Type)
7669 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007670 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7671 kwlist, &x, &encoding, &errors))
7672 return NULL;
7673 if (x == NULL)
7674 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007675 if (encoding == NULL && errors == NULL)
7676 return PyObject_Unicode(x);
7677 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007678 return PyUnicode_FromEncodedObject(x, encoding, errors);
7679}
7680
Guido van Rossume023fe02001-08-30 03:12:59 +00007681static PyObject *
7682unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7683{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007684 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007685 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007686
7687 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7688 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7689 if (tmp == NULL)
7690 return NULL;
7691 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007692 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007693 if (pnew == NULL) {
7694 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007695 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007696 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007697 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7698 if (pnew->str == NULL) {
7699 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007700 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007701 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007702 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007703 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007704 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7705 pnew->length = n;
7706 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007707 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007708 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007709}
7710
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007711PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007712"unicode(string [, encoding[, errors]]) -> object\n\
7713\n\
7714Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007715encoding defaults to the current default string encoding.\n\
7716errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007717
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718PyTypeObject PyUnicode_Type = {
7719 PyObject_HEAD_INIT(&PyType_Type)
7720 0, /* ob_size */
7721 "unicode", /* tp_name */
7722 sizeof(PyUnicodeObject), /* tp_size */
7723 0, /* tp_itemsize */
7724 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007725 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007726 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007727 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728 0, /* tp_setattr */
7729 (cmpfunc) unicode_compare, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00007730 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007731 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007733 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007734 (hashfunc) unicode_hash, /* tp_hash*/
7735 0, /* tp_call*/
7736 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007737 PyObject_GenericGetAttr, /* tp_getattro */
7738 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007739 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007740 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7741 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007742 unicode_doc, /* tp_doc */
7743 0, /* tp_traverse */
7744 0, /* tp_clear */
7745 0, /* tp_richcompare */
7746 0, /* tp_weaklistoffset */
7747 0, /* tp_iter */
7748 0, /* tp_iternext */
7749 unicode_methods, /* tp_methods */
7750 0, /* tp_members */
7751 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007752 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007753 0, /* tp_dict */
7754 0, /* tp_descr_get */
7755 0, /* tp_descr_set */
7756 0, /* tp_dictoffset */
7757 0, /* tp_init */
7758 0, /* tp_alloc */
7759 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007760 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007761};
7762
7763/* Initialize the Unicode implementation */
7764
Thomas Wouters78890102000-07-22 19:25:51 +00007765void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007767 int i;
7768
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007769 /* XXX - move this array to unicodectype.c ? */
7770 Py_UNICODE linebreak[] = {
7771 0x000A, /* LINE FEED */
7772 0x000D, /* CARRIAGE RETURN */
7773 0x001C, /* FILE SEPARATOR */
7774 0x001D, /* GROUP SEPARATOR */
7775 0x001E, /* RECORD SEPARATOR */
7776 0x0085, /* NEXT LINE */
7777 0x2028, /* LINE SEPARATOR */
7778 0x2029, /* PARAGRAPH SEPARATOR */
7779 };
7780
Fred Drakee4315f52000-05-09 19:53:39 +00007781 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007782 unicode_freelist = NULL;
7783 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007784 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007785 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007786 for (i = 0; i < 256; i++)
7787 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007788 if (PyType_Ready(&PyUnicode_Type) < 0)
7789 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007790
7791 /* initialize the linebreak bloom filter */
7792 bloom_linebreak = make_bloom_mask(
7793 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
7794 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007795}
7796
7797/* Finalize the Unicode implementation */
7798
7799void
Thomas Wouters78890102000-07-22 19:25:51 +00007800_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007802 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007803 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007805 Py_XDECREF(unicode_empty);
7806 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007807
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007808 for (i = 0; i < 256; i++) {
7809 if (unicode_latin1[i]) {
7810 Py_DECREF(unicode_latin1[i]);
7811 unicode_latin1[i] = NULL;
7812 }
7813 }
7814
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007815 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007816 PyUnicodeObject *v = u;
7817 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007818 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007819 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007820 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007821 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007822 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007823 unicode_freelist = NULL;
7824 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007825}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007826
Anthony Baxterac6bd462006-04-13 02:06:09 +00007827#ifdef __cplusplus
7828}
7829#endif
7830
7831
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007832/*
7833Local variables:
7834c-basic-offset: 4
7835indent-tabs-mode: nil
7836End:
7837*/