blob: 290e8dfb4759cd6f8b22f4969045ee1eaf3b1e36 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000116PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000117{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000118#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000119 return 0x10FFFF;
120#else
121 /* This is actually an illegal character, so it should
122 not be passed to unichr. */
123 return 0xFFFF;
124#endif
125}
126
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000127/* --- Bloom Filters ----------------------------------------------------- */
128
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low 5
   bits of each Unicode character as the bit index. */
132
133/* the linebreak mask is set up by Unicode_Init below */
134
/* Integer type that holds a bloom bitmask (one bit per 5-bit index). */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is 1UL rather than 1: the shift count can be as
   large as 31, and shifting a signed int 1 into its sign bit is undefined
   behaviour.  mask is parenthesised so that an expression argument such
   as (a | b) is tested as a whole. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap pre-filter with bloom_linebreak, then the exact line break test. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
143
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000144Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000145{
146 /* calculate simple bloom-style bitmask for a given unicode string */
147
148 long mask;
149 Py_ssize_t i;
150
151 mask = 0;
152 for (i = 0; i < len; i++)
153 mask |= (1 << (ptr[i] & 0x1F));
154
155 return mask;
156}
157
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000158Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000159{
160 Py_ssize_t i;
161
162 for (i = 0; i < setlen; i++)
163 if (set[i] == chr)
164 return 1;
165
Fredrik Lundh77633512006-05-23 19:47:35 +0000166 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000167}
168
/* Membership test: bloom pre-filter first, exact search only on a hit.
   The whole expansion is parenthesised so that the macro behaves as a
   single operand when combined with !, || or other operators. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
171
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172/* --- Unicode Object ----------------------------------------------------- */
173
174static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000176 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177{
178 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000179
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000180 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000181 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000182 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184 /* Resizing shared object (unicode_empty or single character
185 objects) in-place is not allowed. Use PyUnicode_Resize()
186 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000187
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000188 if (unicode == unicode_empty ||
189 (unicode->length == 1 &&
190 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000191 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000192 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 return -1;
195 }
196
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000197 /* We allocate one more byte to make sure the string is Ux0000 terminated.
198 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000199 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000200 it contains). */
201
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 oldstr = unicode->str;
203 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
204 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000205 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000206 PyErr_NoMemory();
207 return -1;
208 }
209 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000210 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000212 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000214 if (unicode->defenc) {
215 Py_DECREF(unicode->defenc);
216 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217 }
218 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000219
Guido van Rossumd57fd912000-03-10 22:53:23 +0000220 return 0;
221}
222
223/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000224 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225
226 XXX This allocator could further be enhanced by assuring that the
227 free list never reduces its size below 1.
228
229*/
230
231static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000232PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233{
234 register PyUnicodeObject *unicode;
235
Andrew Dalkee0df7622006-05-27 11:04:36 +0000236 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237 if (length == 0 && unicode_empty != NULL) {
238 Py_INCREF(unicode_empty);
239 return unicode_empty;
240 }
241
242 /* Unicode freelist & memory allocation */
243 if (unicode_freelist) {
244 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000245 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000248 /* Keep-Alive optimization: we only upsize the buffer,
249 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000250 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000251 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000252 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000253 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254 }
255 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000256 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000258 }
259 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000262 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 if (unicode == NULL)
264 return NULL;
265 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
266 }
267
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000268 if (!unicode->str) {
269 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000270 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000271 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000272 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000273 * the caller fails before initializing str -- unicode_resize()
274 * reads str[0], and the Keep-Alive optimization can keep memory
275 * allocated for str alive across a call to unicode_dealloc(unicode).
276 * We don't want unicode_resize to read uninitialized memory in
277 * that case.
278 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000279 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000283 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285
286 onError:
287 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000288 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000289 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290}
291
292static
Guido van Rossum9475a232001-10-05 20:51:39 +0000293void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000295 if (PyUnicode_CheckExact(unicode) &&
296 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000297 /* Keep-Alive optimization */
298 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000299 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 unicode->str = NULL;
301 unicode->length = 0;
302 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000303 if (unicode->defenc) {
304 Py_DECREF(unicode->defenc);
305 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000306 }
307 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 *(PyUnicodeObject **)unicode = unicode_freelist;
309 unicode_freelist = unicode;
310 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311 }
312 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000313 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000314 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000315 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316 }
317}
318
Martin v. Löwis18e16552006-02-15 17:27:45 +0000319int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000320{
321 register PyUnicodeObject *v;
322
323 /* Argument checks */
324 if (unicode == NULL) {
325 PyErr_BadInternalCall();
326 return -1;
327 }
328 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000329 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000330 PyErr_BadInternalCall();
331 return -1;
332 }
333
334 /* Resizing unicode_empty and single character objects is not
335 possible since these are being shared. We simply return a fresh
336 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000337 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000338 (v == unicode_empty || v->length == 1)) {
339 PyUnicodeObject *w = _PyUnicode_New(length);
340 if (w == NULL)
341 return -1;
342 Py_UNICODE_COPY(w->str, v->str,
343 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000344 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000345 *unicode = (PyObject *)w;
346 return 0;
347 }
348
349 /* Note that we don't have to modify *unicode for unshared Unicode
350 objects, since we can modify them in-place. */
351 return unicode_resize(v, length);
352}
353
354/* Internal API for use in unicodeobject.c only ! */
/* Convenience wrapper around PyUnicode_Resize() that casts the address of
   a PyUnicodeObject* variable to PyObject**.  The length argument is
   parenthesised so that any expression can be passed safely. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
357
Guido van Rossumd57fd912000-03-10 22:53:23 +0000358PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000359 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360{
361 PyUnicodeObject *unicode;
362
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000363 /* If the Unicode data is known at construction time, we can apply
364 some optimizations which share commonly used objects. */
365 if (u != NULL) {
366
367 /* Optimization for empty strings */
368 if (size == 0 && unicode_empty != NULL) {
369 Py_INCREF(unicode_empty);
370 return (PyObject *)unicode_empty;
371 }
372
373 /* Single character Unicode objects in the Latin-1 range are
374 shared when using this constructor */
375 if (size == 1 && *u < 256) {
376 unicode = unicode_latin1[*u];
377 if (!unicode) {
378 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000379 if (!unicode)
380 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000381 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000382 unicode_latin1[*u] = unicode;
383 }
384 Py_INCREF(unicode);
385 return (PyObject *)unicode;
386 }
387 }
Tim Petersced69f82003-09-16 20:30:58 +0000388
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 unicode = _PyUnicode_New(size);
390 if (!unicode)
391 return NULL;
392
393 /* Copy the Unicode data into the new object */
394 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000395 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000396
397 return (PyObject *)unicode;
398}
399
400#ifdef HAVE_WCHAR_H
401
402PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000403 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000404{
405 PyUnicodeObject *unicode;
406
407 if (w == NULL) {
408 PyErr_BadInternalCall();
409 return NULL;
410 }
411
412 unicode = _PyUnicode_New(size);
413 if (!unicode)
414 return NULL;
415
416 /* Copy the wchar_t data into the new object */
417#ifdef HAVE_USABLE_WCHAR_T
418 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000419#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000420 {
421 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000422 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000424 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425 *u++ = *w++;
426 }
427#endif
428
429 return (PyObject *)unicode;
430}
431
Martin v. Löwis18e16552006-02-15 17:27:45 +0000432Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
433 wchar_t *w,
434 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000435{
436 if (unicode == NULL) {
437 PyErr_BadInternalCall();
438 return -1;
439 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000440
441 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000442 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000443 size = PyUnicode_GET_SIZE(unicode) + 1;
444
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445#ifdef HAVE_USABLE_WCHAR_T
446 memcpy(w, unicode->str, size * sizeof(wchar_t));
447#else
448 {
449 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000450 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000451 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000452 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000453 *w++ = *u++;
454 }
455#endif
456
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000457 if (size > PyUnicode_GET_SIZE(unicode))
458 return PyUnicode_GET_SIZE(unicode);
459 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000460 return size;
461}
462
463#endif
464
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000465PyObject *PyUnicode_FromOrdinal(int ordinal)
466{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000467 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000468
469#ifdef Py_UNICODE_WIDE
470 if (ordinal < 0 || ordinal > 0x10ffff) {
471 PyErr_SetString(PyExc_ValueError,
472 "unichr() arg not in range(0x110000) "
473 "(wide Python build)");
474 return NULL;
475 }
476#else
477 if (ordinal < 0 || ordinal > 0xffff) {
478 PyErr_SetString(PyExc_ValueError,
479 "unichr() arg not in range(0x10000) "
480 "(narrow Python build)");
481 return NULL;
482 }
483#endif
484
Hye-Shik Chang40574832004-04-06 07:24:51 +0000485 s[0] = (Py_UNICODE)ordinal;
486 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000487}
488
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489PyObject *PyUnicode_FromObject(register PyObject *obj)
490{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491 /* XXX Perhaps we should make this API an alias of
492 PyObject_Unicode() instead ?! */
493 if (PyUnicode_CheckExact(obj)) {
494 Py_INCREF(obj);
495 return obj;
496 }
497 if (PyUnicode_Check(obj)) {
498 /* For a Unicode subtype that's not a Unicode object,
499 return a true Unicode object with the same data. */
500 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
501 PyUnicode_GET_SIZE(obj));
502 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000503 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
504}
505
506PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
507 const char *encoding,
508 const char *errors)
509{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000510 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000511 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000512 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000513
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 if (obj == NULL) {
515 PyErr_BadInternalCall();
516 return NULL;
517 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000518
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000519#if 0
520 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000521 that no encodings is given and then redirect to
522 PyObject_Unicode() which then applies the additional logic for
523 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000524
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000525 NOTE: This API should really only be used for object which
526 represent *encoded* Unicode !
527
528 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000529 if (PyUnicode_Check(obj)) {
530 if (encoding) {
531 PyErr_SetString(PyExc_TypeError,
532 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000533 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000534 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000535 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000536 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000537#else
538 if (PyUnicode_Check(obj)) {
539 PyErr_SetString(PyExc_TypeError,
540 "decoding Unicode is not supported");
541 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000542 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000543#endif
544
545 /* Coerce object */
546 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000547 s = PyString_AS_STRING(obj);
548 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000549 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000550 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
551 /* Overwrite the error message with something more useful in
552 case of a TypeError. */
553 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000554 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000555 "coercing to Unicode: need string or buffer, "
556 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000557 obj->ob_type->tp_name);
558 goto onError;
559 }
Tim Petersced69f82003-09-16 20:30:58 +0000560
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000561 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000562 if (len == 0) {
563 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000564 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000565 }
Tim Petersced69f82003-09-16 20:30:58 +0000566 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000567 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000568
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000569 return v;
570
571 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000572 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573}
574
575PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000576 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000577 const char *encoding,
578 const char *errors)
579{
580 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000581
582 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000583 encoding = PyUnicode_GetDefaultEncoding();
584
585 /* Shortcuts for common default encodings */
586 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000587 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000588 else if (strcmp(encoding, "latin-1") == 0)
589 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000590#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
591 else if (strcmp(encoding, "mbcs") == 0)
592 return PyUnicode_DecodeMBCS(s, size, errors);
593#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000594 else if (strcmp(encoding, "ascii") == 0)
595 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596
597 /* Decode via the codec registry */
598 buffer = PyBuffer_FromMemory((void *)s, size);
599 if (buffer == NULL)
600 goto onError;
601 unicode = PyCodec_Decode(buffer, encoding, errors);
602 if (unicode == NULL)
603 goto onError;
604 if (!PyUnicode_Check(unicode)) {
605 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000606 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000607 unicode->ob_type->tp_name);
608 Py_DECREF(unicode);
609 goto onError;
610 }
611 Py_DECREF(buffer);
612 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000613
Guido van Rossumd57fd912000-03-10 22:53:23 +0000614 onError:
615 Py_XDECREF(buffer);
616 return NULL;
617}
618
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000619PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
620 const char *encoding,
621 const char *errors)
622{
623 PyObject *v;
624
625 if (!PyUnicode_Check(unicode)) {
626 PyErr_BadArgument();
627 goto onError;
628 }
629
630 if (encoding == NULL)
631 encoding = PyUnicode_GetDefaultEncoding();
632
633 /* Decode via the codec registry */
634 v = PyCodec_Decode(unicode, encoding, errors);
635 if (v == NULL)
636 goto onError;
637 return v;
638
639 onError:
640 return NULL;
641}
642
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000644 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 const char *encoding,
646 const char *errors)
647{
648 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000649
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 unicode = PyUnicode_FromUnicode(s, size);
651 if (unicode == NULL)
652 return NULL;
653 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
654 Py_DECREF(unicode);
655 return v;
656}
657
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000658PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
659 const char *encoding,
660 const char *errors)
661{
662 PyObject *v;
663
664 if (!PyUnicode_Check(unicode)) {
665 PyErr_BadArgument();
666 goto onError;
667 }
668
669 if (encoding == NULL)
670 encoding = PyUnicode_GetDefaultEncoding();
671
672 /* Encode via the codec registry */
673 v = PyCodec_Encode(unicode, encoding, errors);
674 if (v == NULL)
675 goto onError;
676 return v;
677
678 onError:
679 return NULL;
680}
681
Guido van Rossumd57fd912000-03-10 22:53:23 +0000682PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
683 const char *encoding,
684 const char *errors)
685{
686 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000687
Guido van Rossumd57fd912000-03-10 22:53:23 +0000688 if (!PyUnicode_Check(unicode)) {
689 PyErr_BadArgument();
690 goto onError;
691 }
Fred Drakee4315f52000-05-09 19:53:39 +0000692
Tim Petersced69f82003-09-16 20:30:58 +0000693 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000694 encoding = PyUnicode_GetDefaultEncoding();
695
696 /* Shortcuts for common default encodings */
697 if (errors == NULL) {
698 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000699 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000700 else if (strcmp(encoding, "latin-1") == 0)
701 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000702#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
703 else if (strcmp(encoding, "mbcs") == 0)
704 return PyUnicode_AsMBCSString(unicode);
705#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000706 else if (strcmp(encoding, "ascii") == 0)
707 return PyUnicode_AsASCIIString(unicode);
708 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000709
710 /* Encode via the codec registry */
711 v = PyCodec_Encode(unicode, encoding, errors);
712 if (v == NULL)
713 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000714 if (!PyString_Check(v)) {
715 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000716 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717 v->ob_type->tp_name);
718 Py_DECREF(v);
719 goto onError;
720 }
721 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000722
Guido van Rossumd57fd912000-03-10 22:53:23 +0000723 onError:
724 return NULL;
725}
726
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000727PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
728 const char *errors)
729{
730 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
731
732 if (v)
733 return v;
734 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
735 if (v && errors == NULL)
736 ((PyUnicodeObject *)unicode)->defenc = v;
737 return v;
738}
739
Guido van Rossumd57fd912000-03-10 22:53:23 +0000740Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
741{
742 if (!PyUnicode_Check(unicode)) {
743 PyErr_BadArgument();
744 goto onError;
745 }
746 return PyUnicode_AS_UNICODE(unicode);
747
748 onError:
749 return NULL;
750}
751
Martin v. Löwis18e16552006-02-15 17:27:45 +0000752Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000753{
754 if (!PyUnicode_Check(unicode)) {
755 PyErr_BadArgument();
756 goto onError;
757 }
758 return PyUnicode_GET_SIZE(unicode);
759
760 onError:
761 return -1;
762}
763
Thomas Wouters78890102000-07-22 19:25:51 +0000764const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000765{
766 return unicode_default_encoding;
767}
768
769int PyUnicode_SetDefaultEncoding(const char *encoding)
770{
771 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000772
Fred Drakee4315f52000-05-09 19:53:39 +0000773 /* Make sure the encoding is valid. As side effect, this also
774 loads the encoding into the codec registry cache. */
775 v = _PyCodec_Lookup(encoding);
776 if (v == NULL)
777 goto onError;
778 Py_DECREF(v);
779 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000780 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000781 sizeof(unicode_default_encoding));
782 return 0;
783
784 onError:
785 return -1;
786}
787
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000788/* error handling callback helper:
789 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000790 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000791 and adjust various state variables.
792 return 0 on success, -1 on error
793*/
794
795static
796int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
797 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000798 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
799 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000800{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000801 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000802
803 PyObject *restuple = NULL;
804 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000805 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
806 Py_ssize_t requiredsize;
807 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000808 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000809 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000810 int res = -1;
811
812 if (*errorHandler == NULL) {
813 *errorHandler = PyCodec_LookupError(errors);
814 if (*errorHandler == NULL)
815 goto onError;
816 }
817
818 if (*exceptionObject == NULL) {
819 *exceptionObject = PyUnicodeDecodeError_Create(
820 encoding, input, insize, *startinpos, *endinpos, reason);
821 if (*exceptionObject == NULL)
822 goto onError;
823 }
824 else {
825 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
826 goto onError;
827 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
828 goto onError;
829 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
830 goto onError;
831 }
832
833 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
834 if (restuple == NULL)
835 goto onError;
836 if (!PyTuple_Check(restuple)) {
837 PyErr_Format(PyExc_TypeError, &argparse[4]);
838 goto onError;
839 }
840 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
841 goto onError;
842 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000843 newpos = insize+newpos;
844 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000845 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000846 goto onError;
847 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000848
849 /* need more space? (at least enough for what we
850 have+the replacement+the rest of the string (starting
851 at the new input position), so we won't have to check space
852 when there are no errors in the rest of the string) */
853 repptr = PyUnicode_AS_UNICODE(repunicode);
854 repsize = PyUnicode_GET_SIZE(repunicode);
855 requiredsize = *outpos + repsize + insize-newpos;
856 if (requiredsize > outsize) {
857 if (requiredsize<2*outsize)
858 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000859 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000860 goto onError;
861 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
862 }
863 *endinpos = newpos;
864 *inptr = input + newpos;
865 Py_UNICODE_COPY(*outptr, repptr, repsize);
866 *outptr += repsize;
867 *outpos += repsize;
868 /* we made it! */
869 res = 0;
870
871 onError:
872 Py_XDECREF(restuple);
873 return res;
874}
875
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000876/* --- UTF-7 Codec -------------------------------------------------------- */
877
878/* see RFC2152 for details */
879
/* Per-character classification for the UTF-7 codec, indexed by ASCII
   code (0..127).  Indicates whether a character is special, i.e. cannot
   be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static char utf7_special[128] = {
    /* 0x00 - 0x0f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
898
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if c cannot be written directly in UTF-7: anything outside
   1..127, class 1 in utf7_special, and (when the corresponding flag is
   set) whitespace (class 2) or Set O (class 3) characters. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to its base64 alphabet character. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* True if c belongs to the base64 alphabet.  The range test comes first
   because isalnum() is only defined for values representable as
   unsigned char (or EOF), and may be locale-dependent above 127; the
   base64 alphabet is pure ASCII. */
#define B64CHAR(c) \
    ((unsigned long)(c) < 128 && \
     (isalnum((int)(c)) || (c) == '+' || (c) == '/'))
/* Inverse of B64: value (0..63) of the base64 character c.  Only
   meaningful when B64CHAR(c) is true. */
#define UB64(c)        \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 characters to out for as long as at least 6 bits are
   pending in ch; bits is decremented accordingly.  Modifies out and
   bits. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* Extract complete 16-bit units from the bit accumulator ch and append
   them to out.  Relies on the enclosing function providing an `errmsg`
   variable and a `utf7Error` label.  Modifies out, bits and surrogate. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000942PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000943 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000944 const char *errors)
945{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000946 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000947 Py_ssize_t startinpos;
948 Py_ssize_t endinpos;
949 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000950 const char *e;
951 PyUnicodeObject *unicode;
952 Py_UNICODE *p;
953 const char *errmsg = "";
954 int inShift = 0;
955 unsigned int bitsleft = 0;
956 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000957 int surrogate = 0;
958 PyObject *errorHandler = NULL;
959 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000960
961 unicode = _PyUnicode_New(size);
962 if (!unicode)
963 return NULL;
964 if (size == 0)
965 return (PyObject *)unicode;
966
967 p = unicode->str;
968 e = s + size;
969
970 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000971 Py_UNICODE ch;
972 restart:
973 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000974
975 if (inShift) {
976 if ((ch == '-') || !B64CHAR(ch)) {
977 inShift = 0;
978 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000979
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000980 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
981 if (bitsleft >= 6) {
982 /* The shift sequence has a partial character in it. If
983 bitsleft < 6 then we could just classify it as padding
984 but that is not the case here */
985
986 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000987 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000988 }
989 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000990 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000991 here so indicate the potential of a misencoded character. */
992
993 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
994 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
995 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000996 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000997 }
998
999 if (ch == '-') {
1000 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001001 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001002 inShift = 1;
1003 }
1004 } else if (SPECIAL(ch,0,0)) {
1005 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001006 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001007 } else {
1008 *p++ = ch;
1009 }
1010 } else {
1011 charsleft = (charsleft << 6) | UB64(ch);
1012 bitsleft += 6;
1013 s++;
1014 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1015 }
1016 }
1017 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001018 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001019 s++;
1020 if (s < e && *s == '-') {
1021 s++;
1022 *p++ = '+';
1023 } else
1024 {
1025 inShift = 1;
1026 bitsleft = 0;
1027 }
1028 }
1029 else if (SPECIAL(ch,0,0)) {
1030 errmsg = "unexpected special character";
1031 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001032 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001033 }
1034 else {
1035 *p++ = ch;
1036 s++;
1037 }
1038 continue;
1039 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001040 outpos = p-PyUnicode_AS_UNICODE(unicode);
1041 endinpos = s-starts;
1042 if (unicode_decode_call_errorhandler(
1043 errors, &errorHandler,
1044 "utf7", errmsg,
1045 starts, size, &startinpos, &endinpos, &exc, &s,
1046 (PyObject **)&unicode, &outpos, &p))
1047 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001048 }
1049
1050 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001051 outpos = p-PyUnicode_AS_UNICODE(unicode);
1052 endinpos = size;
1053 if (unicode_decode_call_errorhandler(
1054 errors, &errorHandler,
1055 "utf7", "unterminated shift sequence",
1056 starts, size, &startinpos, &endinpos, &exc, &s,
1057 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001058 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001059 if (s < e)
1060 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001061 }
1062
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001063 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001064 goto onError;
1065
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001066 Py_XDECREF(errorHandler);
1067 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001068 return (PyObject *)unicode;
1069
1070onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001071 Py_XDECREF(errorHandler);
1072 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001073 Py_DECREF(unicode);
1074 return NULL;
1075}
1076
1077
1078PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001079 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001080 int encodeSetO,
1081 int encodeWhiteSpace,
1082 const char *errors)
1083{
1084 PyObject *v;
1085 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001086 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001087 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001088 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001089 unsigned int bitsleft = 0;
1090 unsigned long charsleft = 0;
1091 char * out;
1092 char * start;
1093
1094 if (size == 0)
1095 return PyString_FromStringAndSize(NULL, 0);
1096
1097 v = PyString_FromStringAndSize(NULL, cbAllocated);
1098 if (v == NULL)
1099 return NULL;
1100
1101 start = out = PyString_AS_STRING(v);
1102 for (;i < size; ++i) {
1103 Py_UNICODE ch = s[i];
1104
1105 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001106 if (ch == '+') {
1107 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001108 *out++ = '-';
1109 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1110 charsleft = ch;
1111 bitsleft = 16;
1112 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001113 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001114 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001115 } else {
1116 *out++ = (char) ch;
1117 }
1118 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001119 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1120 *out++ = B64(charsleft << (6-bitsleft));
1121 charsleft = 0;
1122 bitsleft = 0;
1123 /* Characters not in the BASE64 set implicitly unshift the sequence
1124 so no '-' is required, except if the character is itself a '-' */
1125 if (B64CHAR(ch) || ch == '-') {
1126 *out++ = '-';
1127 }
1128 inShift = 0;
1129 *out++ = (char) ch;
1130 } else {
1131 bitsleft += 16;
1132 charsleft = (charsleft << 16) | ch;
1133 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1134
1135 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001136 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001137 or '-' then the shift sequence will be terminated implicitly and we
1138 don't have to insert a '-'. */
1139
1140 if (bitsleft == 0) {
1141 if (i + 1 < size) {
1142 Py_UNICODE ch2 = s[i+1];
1143
1144 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001145
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001146 } else if (B64CHAR(ch2) || ch2 == '-') {
1147 *out++ = '-';
1148 inShift = 0;
1149 } else {
1150 inShift = 0;
1151 }
1152
1153 }
1154 else {
1155 *out++ = '-';
1156 inShift = 0;
1157 }
1158 }
Tim Petersced69f82003-09-16 20:30:58 +00001159 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001160 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001161 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001162 if (bitsleft) {
1163 *out++= B64(charsleft << (6-bitsleft) );
1164 *out++ = '-';
1165 }
1166
Tim Peters5de98422002-04-27 18:44:32 +00001167 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001168 return v;
1169}
1170
1171#undef SPECIAL
1172#undef B64
1173#undef B64CHAR
1174#undef UB64
1175#undef ENCODE
1176#undef DECODE
1177
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178/* --- UTF-8 Codec -------------------------------------------------------- */
1179
/* Map UTF-8 encoded prefix byte to sequence length. zero means
   illegal prefix. see RFC 2279 for details */
static char utf8_code_length[256] = {
    /* 0x00 - 0x0f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 - 0x1f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 - 0x3f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 - 0x4f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 - 0x6f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0x8f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 - 0x9f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xa0 - 0xaf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xb0 - 0xbf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xcf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xd0 - 0xdf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1201
Guido van Rossumd57fd912000-03-10 22:53:23 +00001202PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001203 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001204 const char *errors)
1205{
Walter Dörwald69652032004-09-07 20:24:22 +00001206 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1207}
1208
1209PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001210 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001211 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001212 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001213{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001214 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001215 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001216 Py_ssize_t startinpos;
1217 Py_ssize_t endinpos;
1218 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219 const char *e;
1220 PyUnicodeObject *unicode;
1221 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001222 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001223 PyObject *errorHandler = NULL;
1224 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225
1226 /* Note: size will always be longer than the resulting Unicode
1227 character count */
1228 unicode = _PyUnicode_New(size);
1229 if (!unicode)
1230 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001231 if (size == 0) {
1232 if (consumed)
1233 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001234 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236
1237 /* Unpack UTF-8 encoded data */
1238 p = unicode->str;
1239 e = s + size;
1240
1241 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001242 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243
1244 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001245 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001246 s++;
1247 continue;
1248 }
1249
1250 n = utf8_code_length[ch];
1251
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001252 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001253 if (consumed)
1254 break;
1255 else {
1256 errmsg = "unexpected end of data";
1257 startinpos = s-starts;
1258 endinpos = size;
1259 goto utf8Error;
1260 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001261 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001262
1263 switch (n) {
1264
1265 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001266 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001267 startinpos = s-starts;
1268 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001269 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001270
1271 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001272 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001273 startinpos = s-starts;
1274 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001275 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276
1277 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001278 if ((s[1] & 0xc0) != 0x80) {
1279 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001280 startinpos = s-starts;
1281 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001282 goto utf8Error;
1283 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001284 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001285 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001286 startinpos = s-starts;
1287 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001288 errmsg = "illegal encoding";
1289 goto utf8Error;
1290 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001292 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 break;
1294
1295 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001296 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001297 (s[2] & 0xc0) != 0x80) {
1298 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001299 startinpos = s-starts;
1300 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001301 goto utf8Error;
1302 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001303 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001304 if (ch < 0x0800) {
1305 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001306 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001307
1308 XXX For wide builds (UCS-4) we should probably try
1309 to recombine the surrogates into a single code
1310 unit.
1311 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001312 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001313 startinpos = s-starts;
1314 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001315 goto utf8Error;
1316 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001317 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001318 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001319 break;
1320
1321 case 4:
1322 if ((s[1] & 0xc0) != 0x80 ||
1323 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001324 (s[3] & 0xc0) != 0x80) {
1325 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001326 startinpos = s-starts;
1327 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001328 goto utf8Error;
1329 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001330 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1331 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1332 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001333 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001334 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001335 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001336 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001337 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001338 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001339 startinpos = s-starts;
1340 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001341 goto utf8Error;
1342 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001343#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001344 *p++ = (Py_UNICODE)ch;
1345#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001346 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001347
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001348 /* translate from 10000..10FFFF to 0..FFFF */
1349 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001350
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001351 /* high surrogate = top 10 bits added to D800 */
1352 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001353
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001354 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001355 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001356#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001357 break;
1358
1359 default:
1360 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001361 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001362 startinpos = s-starts;
1363 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001364 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365 }
1366 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001367 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001368
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001369 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001370 outpos = p-PyUnicode_AS_UNICODE(unicode);
1371 if (unicode_decode_call_errorhandler(
1372 errors, &errorHandler,
1373 "utf8", errmsg,
1374 starts, size, &startinpos, &endinpos, &exc, &s,
1375 (PyObject **)&unicode, &outpos, &p))
1376 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001377 }
Walter Dörwald69652032004-09-07 20:24:22 +00001378 if (consumed)
1379 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001380
1381 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001382 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001383 goto onError;
1384
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001385 Py_XDECREF(errorHandler);
1386 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 return (PyObject *)unicode;
1388
1389onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001390 Py_XDECREF(errorHandler);
1391 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 Py_DECREF(unicode);
1393 return NULL;
1394}
1395
Tim Peters602f7402002-04-27 18:03:26 +00001396/* Allocation strategy: if the string is short, convert into a stack buffer
1397 and allocate exactly as much space needed at the end. Else allocate the
1398 maximum possible needed (4 result bytes per Unicode character), and return
1399 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001400*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001401PyObject *
1402PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001403 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001404 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001405{
Tim Peters602f7402002-04-27 18:03:26 +00001406#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001407
Martin v. Löwis18e16552006-02-15 17:27:45 +00001408 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001409 PyObject *v; /* result string object */
1410 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001411 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001412 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001413 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001414
Tim Peters602f7402002-04-27 18:03:26 +00001415 assert(s != NULL);
1416 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001417
Tim Peters602f7402002-04-27 18:03:26 +00001418 if (size <= MAX_SHORT_UNICHARS) {
1419 /* Write into the stack buffer; nallocated can't overflow.
1420 * At the end, we'll allocate exactly as much heap space as it
1421 * turns out we need.
1422 */
1423 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1424 v = NULL; /* will allocate after we're done */
1425 p = stackbuf;
1426 }
1427 else {
1428 /* Overallocate on the heap, and give the excess back at the end. */
1429 nallocated = size * 4;
1430 if (nallocated / 4 != size) /* overflow! */
1431 return PyErr_NoMemory();
1432 v = PyString_FromStringAndSize(NULL, nallocated);
1433 if (v == NULL)
1434 return NULL;
1435 p = PyString_AS_STRING(v);
1436 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001437
Tim Peters602f7402002-04-27 18:03:26 +00001438 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001439 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001440
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001441 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001442 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001443 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001444
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001446 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001447 *p++ = (char)(0xc0 | (ch >> 6));
1448 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001449 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001450 else {
Tim Peters602f7402002-04-27 18:03:26 +00001451 /* Encode UCS2 Unicode ordinals */
1452 if (ch < 0x10000) {
1453 /* Special case: check for high surrogate */
1454 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1455 Py_UCS4 ch2 = s[i];
1456 /* Check for low surrogate and combine the two to
1457 form a UCS4 value */
1458 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001459 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001460 i++;
1461 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001462 }
Tim Peters602f7402002-04-27 18:03:26 +00001463 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001464 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001465 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001466 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1467 *p++ = (char)(0x80 | (ch & 0x3f));
1468 continue;
1469 }
1470encodeUCS4:
1471 /* Encode UCS4 Unicode ordinals */
1472 *p++ = (char)(0xf0 | (ch >> 18));
1473 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1474 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1475 *p++ = (char)(0x80 | (ch & 0x3f));
1476 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001477 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001478
Tim Peters602f7402002-04-27 18:03:26 +00001479 if (v == NULL) {
1480 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001481 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001482 assert(nneeded <= nallocated);
1483 v = PyString_FromStringAndSize(stackbuf, nneeded);
1484 }
1485 else {
1486 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001487 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001488 assert(nneeded <= nallocated);
1489 _PyString_Resize(&v, nneeded);
1490 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001491 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001492
Tim Peters602f7402002-04-27 18:03:26 +00001493#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001494}
1495
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1497{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001498 if (!PyUnicode_Check(unicode)) {
1499 PyErr_BadArgument();
1500 return NULL;
1501 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001502 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1503 PyUnicode_GET_SIZE(unicode),
1504 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001505}
1506
1507/* --- UTF-16 Codec ------------------------------------------------------- */
1508
Tim Peters772747b2001-08-09 22:21:55 +00001509PyObject *
1510PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001511 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001512 const char *errors,
1513 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001514{
Walter Dörwald69652032004-09-07 20:24:22 +00001515 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1516}
1517
1518PyObject *
1519PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001520 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001521 const char *errors,
1522 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001523 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001524{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001525 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001526 Py_ssize_t startinpos;
1527 Py_ssize_t endinpos;
1528 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001529 PyUnicodeObject *unicode;
1530 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001531 const unsigned char *q, *e;
1532 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001533 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001534 /* Offsets from q for retrieving byte pairs in the right order. */
1535#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1536 int ihi = 1, ilo = 0;
1537#else
1538 int ihi = 0, ilo = 1;
1539#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001540 PyObject *errorHandler = NULL;
1541 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542
1543 /* Note: size will always be longer than the resulting Unicode
1544 character count */
1545 unicode = _PyUnicode_New(size);
1546 if (!unicode)
1547 return NULL;
1548 if (size == 0)
1549 return (PyObject *)unicode;
1550
1551 /* Unpack UTF-16 encoded data */
1552 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001553 q = (unsigned char *)s;
1554 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555
1556 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001557 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001558
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001559 /* Check for BOM marks (U+FEFF) in the input and adjust current
1560 byte order setting accordingly. In native mode, the leading BOM
1561 mark is skipped, in all other modes, it is copied to the output
1562 stream as-is (giving a ZWNBSP character). */
1563 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001564 if (size >= 2) {
1565 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001566#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001567 if (bom == 0xFEFF) {
1568 q += 2;
1569 bo = -1;
1570 }
1571 else if (bom == 0xFFFE) {
1572 q += 2;
1573 bo = 1;
1574 }
Tim Petersced69f82003-09-16 20:30:58 +00001575#else
Walter Dörwald69652032004-09-07 20:24:22 +00001576 if (bom == 0xFEFF) {
1577 q += 2;
1578 bo = 1;
1579 }
1580 else if (bom == 0xFFFE) {
1581 q += 2;
1582 bo = -1;
1583 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001584#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001585 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001587
Tim Peters772747b2001-08-09 22:21:55 +00001588 if (bo == -1) {
1589 /* force LE */
1590 ihi = 1;
1591 ilo = 0;
1592 }
1593 else if (bo == 1) {
1594 /* force BE */
1595 ihi = 0;
1596 ilo = 1;
1597 }
1598
1599 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001600 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001601 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001602 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001603 if (consumed)
1604 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001605 errmsg = "truncated data";
1606 startinpos = ((const char *)q)-starts;
1607 endinpos = ((const char *)e)-starts;
1608 goto utf16Error;
1609 /* The remaining input chars are ignored if the callback
1610 chooses to skip the input */
1611 }
1612 ch = (q[ihi] << 8) | q[ilo];
1613
Tim Peters772747b2001-08-09 22:21:55 +00001614 q += 2;
1615
Guido van Rossumd57fd912000-03-10 22:53:23 +00001616 if (ch < 0xD800 || ch > 0xDFFF) {
1617 *p++ = ch;
1618 continue;
1619 }
1620
1621 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001622 if (q >= e) {
1623 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001624 startinpos = (((const char *)q)-2)-starts;
1625 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001626 goto utf16Error;
1627 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001628 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001629 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1630 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001631 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001632#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001633 *p++ = ch;
1634 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001635#else
1636 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001637#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001638 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001639 }
1640 else {
1641 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001642 startinpos = (((const char *)q)-4)-starts;
1643 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001644 goto utf16Error;
1645 }
1646
Guido van Rossumd57fd912000-03-10 22:53:23 +00001647 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001648 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001649 startinpos = (((const char *)q)-2)-starts;
1650 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001651 /* Fall through to report the error */
1652
1653 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001654 outpos = p-PyUnicode_AS_UNICODE(unicode);
1655 if (unicode_decode_call_errorhandler(
1656 errors, &errorHandler,
1657 "utf16", errmsg,
1658 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1659 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001660 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001661 }
1662
1663 if (byteorder)
1664 *byteorder = bo;
1665
Walter Dörwald69652032004-09-07 20:24:22 +00001666 if (consumed)
1667 *consumed = (const char *)q-starts;
1668
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001670 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001671 goto onError;
1672
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001673 Py_XDECREF(errorHandler);
1674 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001675 return (PyObject *)unicode;
1676
1677onError:
1678 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001679 Py_XDECREF(errorHandler);
1680 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001681 return NULL;
1682}
1683
Tim Peters772747b2001-08-09 22:21:55 +00001684PyObject *
1685PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001686 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001687 const char *errors,
1688 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001689{
1690 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001691 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001692#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001693 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001694#else
1695 const int pairs = 0;
1696#endif
Tim Peters772747b2001-08-09 22:21:55 +00001697 /* Offsets from p for storing byte pairs in the right order. */
1698#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1699 int ihi = 1, ilo = 0;
1700#else
1701 int ihi = 0, ilo = 1;
1702#endif
1703
1704#define STORECHAR(CH) \
1705 do { \
1706 p[ihi] = ((CH) >> 8) & 0xff; \
1707 p[ilo] = (CH) & 0xff; \
1708 p += 2; \
1709 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001711#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001712 for (i = pairs = 0; i < size; i++)
1713 if (s[i] >= 0x10000)
1714 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001715#endif
Tim Petersced69f82003-09-16 20:30:58 +00001716 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001717 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718 if (v == NULL)
1719 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001720
Tim Peters772747b2001-08-09 22:21:55 +00001721 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001722 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001723 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001724 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001725 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001726
1727 if (byteorder == -1) {
1728 /* force LE */
1729 ihi = 1;
1730 ilo = 0;
1731 }
1732 else if (byteorder == 1) {
1733 /* force BE */
1734 ihi = 0;
1735 ilo = 1;
1736 }
1737
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001738 while (size-- > 0) {
1739 Py_UNICODE ch = *s++;
1740 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001741#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001742 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001743 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1744 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001746#endif
Tim Peters772747b2001-08-09 22:21:55 +00001747 STORECHAR(ch);
1748 if (ch2)
1749 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001750 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001752#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753}
1754
1755PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1756{
1757 if (!PyUnicode_Check(unicode)) {
1758 PyErr_BadArgument();
1759 return NULL;
1760 }
1761 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1762 PyUnicode_GET_SIZE(unicode),
1763 NULL,
1764 0);
1765}
1766
1767/* --- Unicode Escape Codec ----------------------------------------------- */
1768
Fredrik Lundh06d12682001-01-24 07:59:11 +00001769static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001770
Guido van Rossumd57fd912000-03-10 22:53:23 +00001771PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001772 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001773 const char *errors)
1774{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001775 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001776 Py_ssize_t startinpos;
1777 Py_ssize_t endinpos;
1778 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001779 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001781 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001783 char* message;
1784 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001785 PyObject *errorHandler = NULL;
1786 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001787
Guido van Rossumd57fd912000-03-10 22:53:23 +00001788 /* Escaped strings will always be longer than the resulting
1789 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001790 length after conversion to the true value.
1791 (but if the error callback returns a long replacement string
1792 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 v = _PyUnicode_New(size);
1794 if (v == NULL)
1795 goto onError;
1796 if (size == 0)
1797 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001798
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001799 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001800 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001801
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802 while (s < end) {
1803 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001804 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001805 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001806
1807 /* Non-escape characters are interpreted as Unicode ordinals */
1808 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001809 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810 continue;
1811 }
1812
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001813 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001814 /* \ - Escapes */
1815 s++;
1816 switch (*s++) {
1817
1818 /* \x escapes */
1819 case '\n': break;
1820 case '\\': *p++ = '\\'; break;
1821 case '\'': *p++ = '\''; break;
1822 case '\"': *p++ = '\"'; break;
1823 case 'b': *p++ = '\b'; break;
1824 case 'f': *p++ = '\014'; break; /* FF */
1825 case 't': *p++ = '\t'; break;
1826 case 'n': *p++ = '\n'; break;
1827 case 'r': *p++ = '\r'; break;
1828 case 'v': *p++ = '\013'; break; /* VT */
1829 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1830
1831 /* \OOO (octal) escapes */
1832 case '0': case '1': case '2': case '3':
1833 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001834 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001836 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001837 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001838 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001839 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001840 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001841 break;
1842
Fredrik Lundhccc74732001-02-18 22:13:49 +00001843 /* hex escapes */
1844 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001845 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001846 digits = 2;
1847 message = "truncated \\xXX escape";
1848 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849
Fredrik Lundhccc74732001-02-18 22:13:49 +00001850 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001851 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001852 digits = 4;
1853 message = "truncated \\uXXXX escape";
1854 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001855
Fredrik Lundhccc74732001-02-18 22:13:49 +00001856 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001857 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001858 digits = 8;
1859 message = "truncated \\UXXXXXXXX escape";
1860 hexescape:
1861 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001862 outpos = p-PyUnicode_AS_UNICODE(v);
1863 if (s+digits>end) {
1864 endinpos = size;
1865 if (unicode_decode_call_errorhandler(
1866 errors, &errorHandler,
1867 "unicodeescape", "end of string in escape sequence",
1868 starts, size, &startinpos, &endinpos, &exc, &s,
1869 (PyObject **)&v, &outpos, &p))
1870 goto onError;
1871 goto nextByte;
1872 }
1873 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001874 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001875 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001876 endinpos = (s+i+1)-starts;
1877 if (unicode_decode_call_errorhandler(
1878 errors, &errorHandler,
1879 "unicodeescape", message,
1880 starts, size, &startinpos, &endinpos, &exc, &s,
1881 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001882 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001883 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001884 }
1885 chr = (chr<<4) & ~0xF;
1886 if (c >= '0' && c <= '9')
1887 chr += c - '0';
1888 else if (c >= 'a' && c <= 'f')
1889 chr += 10 + c - 'a';
1890 else
1891 chr += 10 + c - 'A';
1892 }
1893 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001894 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001895 /* _decoding_error will have already written into the
1896 target buffer. */
1897 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001898 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001899 /* when we get here, chr is a 32-bit unicode character */
1900 if (chr <= 0xffff)
1901 /* UCS-2 character */
1902 *p++ = (Py_UNICODE) chr;
1903 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001904 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001905 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001906#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001907 *p++ = chr;
1908#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001909 chr -= 0x10000L;
1910 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001911 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001912#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001913 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001914 endinpos = s-starts;
1915 outpos = p-PyUnicode_AS_UNICODE(v);
1916 if (unicode_decode_call_errorhandler(
1917 errors, &errorHandler,
1918 "unicodeescape", "illegal Unicode character",
1919 starts, size, &startinpos, &endinpos, &exc, &s,
1920 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001921 goto onError;
1922 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001923 break;
1924
1925 /* \N{name} */
1926 case 'N':
1927 message = "malformed \\N character escape";
1928 if (ucnhash_CAPI == NULL) {
1929 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001930 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001931 m = PyImport_ImportModule("unicodedata");
1932 if (m == NULL)
1933 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001934 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001935 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001936 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001937 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00001938 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001939 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001940 if (ucnhash_CAPI == NULL)
1941 goto ucnhashError;
1942 }
1943 if (*s == '{') {
1944 const char *start = s+1;
1945 /* look for the closing brace */
1946 while (*s != '}' && s < end)
1947 s++;
1948 if (s > start && s < end && *s == '}') {
1949 /* found a name. look it up in the unicode database */
1950 message = "unknown Unicode character name";
1951 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001952 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001953 goto store;
1954 }
1955 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001956 endinpos = s-starts;
1957 outpos = p-PyUnicode_AS_UNICODE(v);
1958 if (unicode_decode_call_errorhandler(
1959 errors, &errorHandler,
1960 "unicodeescape", message,
1961 starts, size, &startinpos, &endinpos, &exc, &s,
1962 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001963 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001964 break;
1965
1966 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001967 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001968 message = "\\ at end of string";
1969 s--;
1970 endinpos = s-starts;
1971 outpos = p-PyUnicode_AS_UNICODE(v);
1972 if (unicode_decode_call_errorhandler(
1973 errors, &errorHandler,
1974 "unicodeescape", message,
1975 starts, size, &startinpos, &endinpos, &exc, &s,
1976 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001977 goto onError;
1978 }
1979 else {
1980 *p++ = '\\';
1981 *p++ = (unsigned char)s[-1];
1982 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001983 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001985 nextByte:
1986 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00001988 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001989 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001990 Py_XDECREF(errorHandler);
1991 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001993
Fredrik Lundhccc74732001-02-18 22:13:49 +00001994ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001995 PyErr_SetString(
1996 PyExc_UnicodeError,
1997 "\\N escapes not supported (can't load unicodedata module)"
1998 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001999 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002000 Py_XDECREF(errorHandler);
2001 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002002 return NULL;
2003
Fredrik Lundhccc74732001-02-18 22:13:49 +00002004onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002005 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002006 Py_XDECREF(errorHandler);
2007 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008 return NULL;
2009}
2010
2011/* Return a Unicode-Escape string version of the Unicode object.
2012
2013 If quotes is true, the string is enclosed in u"" or u'' quotes as
2014 appropriate.
2015
2016*/
2017
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002018Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Fredrik Lundh95e2a912006-05-26 11:38:15 +00002019 Py_ssize_t size,
2020 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002021{
2022 /* like wcschr, but doesn't stop at NULL characters */
2023
2024 while (size-- > 0) {
2025 if (*s == ch)
2026 return s;
2027 s++;
2028 }
2029
2030 return NULL;
2031}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002032
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033static
2034PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002035 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036 int quotes)
2037{
2038 PyObject *repr;
2039 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002041 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042
Neal Norwitz17753ec2006-08-21 22:21:19 +00002043 /* XXX(nnorwitz): rather than over-allocating, it would be
2044 better to choose a different scheme. Perhaps scan the
2045 first N-chars of the string and allocate based on that size.
2046 */
2047 /* Initial allocation is based on the longest-possible unichr
2048 escape.
2049
2050 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2051 unichr, so in this case it's the longest unichr escape. In
2052 narrow (UTF-16) builds this is five chars per source unichr
2053 since there are two unichrs in the surrogate pair, so in narrow
2054 (UTF-16) builds it's not the longest unichr escape.
2055
2056 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2057 so in the narrow (UTF-16) build case it's the longest unichr
2058 escape.
2059 */
2060
2061 repr = PyString_FromStringAndSize(NULL,
2062 2
2063#ifdef Py_UNICODE_WIDE
2064 + 10*size
2065#else
2066 + 6*size
2067#endif
2068 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069 if (repr == NULL)
2070 return NULL;
2071
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002072 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002073
2074 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002075 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002076 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002077 !findchar(s, size, '"')) ? '"' : '\'';
2078 }
2079 while (size-- > 0) {
2080 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002081
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002082 /* Escape quotes and backslashes */
2083 if ((quotes &&
2084 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002085 *p++ = '\\';
2086 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002087 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002088 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002089
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002090#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002091 /* Map 21-bit characters to '\U00xxxxxx' */
2092 else if (ch >= 0x10000) {
2093 *p++ = '\\';
2094 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002095 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2096 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2097 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2098 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2099 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2100 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2101 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002102 *p++ = hexdigit[ch & 0x0000000F];
2103 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002104 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002105#else
2106 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002107 else if (ch >= 0xD800 && ch < 0xDC00) {
2108 Py_UNICODE ch2;
2109 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002110
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002111 ch2 = *s++;
2112 size--;
2113 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2114 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2115 *p++ = '\\';
2116 *p++ = 'U';
2117 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2118 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2119 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2120 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2121 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2122 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2123 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2124 *p++ = hexdigit[ucs & 0x0000000F];
2125 continue;
2126 }
2127 /* Fall through: isolated surrogates are copied as-is */
2128 s--;
2129 size++;
2130 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002131#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002132
Guido van Rossumd57fd912000-03-10 22:53:23 +00002133 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002134 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002135 *p++ = '\\';
2136 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002137 *p++ = hexdigit[(ch >> 12) & 0x000F];
2138 *p++ = hexdigit[(ch >> 8) & 0x000F];
2139 *p++ = hexdigit[(ch >> 4) & 0x000F];
2140 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002141 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002142
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002143 /* Map special whitespace to '\t', \n', '\r' */
2144 else if (ch == '\t') {
2145 *p++ = '\\';
2146 *p++ = 't';
2147 }
2148 else if (ch == '\n') {
2149 *p++ = '\\';
2150 *p++ = 'n';
2151 }
2152 else if (ch == '\r') {
2153 *p++ = '\\';
2154 *p++ = 'r';
2155 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002156
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002157 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002158 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002159 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002160 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002161 *p++ = hexdigit[(ch >> 4) & 0x000F];
2162 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002163 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002164
Guido van Rossumd57fd912000-03-10 22:53:23 +00002165 /* Copy everything else as-is */
2166 else
2167 *p++ = (char) ch;
2168 }
2169 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002170 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002171
2172 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002173 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174 return repr;
2175}
2176
2177PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002178 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002179{
2180 return unicodeescape_string(s, size, 0);
2181}
2182
2183PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2184{
2185 if (!PyUnicode_Check(unicode)) {
2186 PyErr_BadArgument();
2187 return NULL;
2188 }
2189 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2190 PyUnicode_GET_SIZE(unicode));
2191}
2192
2193/* --- Raw Unicode Escape Codec ------------------------------------------- */
2194
2195PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002196 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002197 const char *errors)
2198{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002199 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002200 Py_ssize_t startinpos;
2201 Py_ssize_t endinpos;
2202 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002203 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002204 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002205 const char *end;
2206 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002207 PyObject *errorHandler = NULL;
2208 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002209
Guido van Rossumd57fd912000-03-10 22:53:23 +00002210 /* Escaped strings will always be longer than the resulting
2211 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002212 length after conversion to the true value. (But decoding error
2213 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002214 v = _PyUnicode_New(size);
2215 if (v == NULL)
2216 goto onError;
2217 if (size == 0)
2218 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002219 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002220 end = s + size;
2221 while (s < end) {
2222 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002223 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002224 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002225 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002226
2227 /* Non-escape characters are interpreted as Unicode ordinals */
2228 if (*s != '\\') {
2229 *p++ = (unsigned char)*s++;
2230 continue;
2231 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002232 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002233
2234 /* \u-escapes are only interpreted iff the number of leading
2235 backslashes if odd */
2236 bs = s;
2237 for (;s < end;) {
2238 if (*s != '\\')
2239 break;
2240 *p++ = (unsigned char)*s++;
2241 }
2242 if (((s - bs) & 1) == 0 ||
2243 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002244 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002245 continue;
2246 }
2247 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002248 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002249 s++;
2250
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002251 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002252 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002253 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002254 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002255 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002256 endinpos = s-starts;
2257 if (unicode_decode_call_errorhandler(
2258 errors, &errorHandler,
2259 "rawunicodeescape", "truncated \\uXXXX",
2260 starts, size, &startinpos, &endinpos, &exc, &s,
2261 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002262 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002263 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002264 }
2265 x = (x<<4) & ~0xF;
2266 if (c >= '0' && c <= '9')
2267 x += c - '0';
2268 else if (c >= 'a' && c <= 'f')
2269 x += 10 + c - 'a';
2270 else
2271 x += 10 + c - 'A';
2272 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002273#ifndef Py_UNICODE_WIDE
2274 if (x > 0x10000) {
2275 if (unicode_decode_call_errorhandler(
2276 errors, &errorHandler,
2277 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2278 starts, size, &startinpos, &endinpos, &exc, &s,
2279 (PyObject **)&v, &outpos, &p))
2280 goto onError;
2281 }
2282#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002283 *p++ = x;
2284 nextByte:
2285 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002286 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002287 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002288 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002289 Py_XDECREF(errorHandler);
2290 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002292
Guido van Rossumd57fd912000-03-10 22:53:23 +00002293 onError:
2294 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002295 Py_XDECREF(errorHandler);
2296 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002297 return NULL;
2298}
2299
2300PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002301 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002302{
2303 PyObject *repr;
2304 char *p;
2305 char *q;
2306
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002307 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002308
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002309#ifdef Py_UNICODE_WIDE
2310 repr = PyString_FromStringAndSize(NULL, 10 * size);
2311#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002312 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002313#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002314 if (repr == NULL)
2315 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002316 if (size == 0)
2317 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002318
2319 p = q = PyString_AS_STRING(repr);
2320 while (size-- > 0) {
2321 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002322#ifdef Py_UNICODE_WIDE
2323 /* Map 32-bit characters to '\Uxxxxxxxx' */
2324 if (ch >= 0x10000) {
2325 *p++ = '\\';
2326 *p++ = 'U';
2327 *p++ = hexdigit[(ch >> 28) & 0xf];
2328 *p++ = hexdigit[(ch >> 24) & 0xf];
2329 *p++ = hexdigit[(ch >> 20) & 0xf];
2330 *p++ = hexdigit[(ch >> 16) & 0xf];
2331 *p++ = hexdigit[(ch >> 12) & 0xf];
2332 *p++ = hexdigit[(ch >> 8) & 0xf];
2333 *p++ = hexdigit[(ch >> 4) & 0xf];
2334 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002335 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002336 else
2337#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002338 /* Map 16-bit characters to '\uxxxx' */
2339 if (ch >= 256) {
2340 *p++ = '\\';
2341 *p++ = 'u';
2342 *p++ = hexdigit[(ch >> 12) & 0xf];
2343 *p++ = hexdigit[(ch >> 8) & 0xf];
2344 *p++ = hexdigit[(ch >> 4) & 0xf];
2345 *p++ = hexdigit[ch & 15];
2346 }
2347 /* Copy everything else as-is */
2348 else
2349 *p++ = (char) ch;
2350 }
2351 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002352 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002353 return repr;
2354}
2355
2356PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2357{
2358 if (!PyUnicode_Check(unicode)) {
2359 PyErr_BadArgument();
2360 return NULL;
2361 }
2362 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2363 PyUnicode_GET_SIZE(unicode));
2364}
2365
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002366/* --- Unicode Internal Codec ------------------------------------------- */
2367
2368PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002369 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002370 const char *errors)
2371{
2372 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002373 Py_ssize_t startinpos;
2374 Py_ssize_t endinpos;
2375 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002376 PyUnicodeObject *v;
2377 Py_UNICODE *p;
2378 const char *end;
2379 const char *reason;
2380 PyObject *errorHandler = NULL;
2381 PyObject *exc = NULL;
2382
Neal Norwitzd43069c2006-01-08 01:12:10 +00002383#ifdef Py_UNICODE_WIDE
2384 Py_UNICODE unimax = PyUnicode_GetMax();
2385#endif
2386
Armin Rigo7ccbca92006-10-04 12:17:45 +00002387 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002388 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2389 if (v == NULL)
2390 goto onError;
2391 if (PyUnicode_GetSize((PyObject *)v) == 0)
2392 return (PyObject *)v;
2393 p = PyUnicode_AS_UNICODE(v);
2394 end = s + size;
2395
2396 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00002397 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002398 /* We have to sanity check the raw data, otherwise doom looms for
2399 some malformed UCS-4 data. */
2400 if (
2401 #ifdef Py_UNICODE_WIDE
2402 *p > unimax || *p < 0 ||
2403 #endif
2404 end-s < Py_UNICODE_SIZE
2405 )
2406 {
2407 startinpos = s - starts;
2408 if (end-s < Py_UNICODE_SIZE) {
2409 endinpos = end-starts;
2410 reason = "truncated input";
2411 }
2412 else {
2413 endinpos = s - starts + Py_UNICODE_SIZE;
2414 reason = "illegal code point (> 0x10FFFF)";
2415 }
2416 outpos = p - PyUnicode_AS_UNICODE(v);
2417 if (unicode_decode_call_errorhandler(
2418 errors, &errorHandler,
2419 "unicode_internal", reason,
2420 starts, size, &startinpos, &endinpos, &exc, &s,
2421 (PyObject **)&v, &outpos, &p)) {
2422 goto onError;
2423 }
2424 }
2425 else {
2426 p++;
2427 s += Py_UNICODE_SIZE;
2428 }
2429 }
2430
Martin v. Löwis412fb672006-04-13 06:34:32 +00002431 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002432 goto onError;
2433 Py_XDECREF(errorHandler);
2434 Py_XDECREF(exc);
2435 return (PyObject *)v;
2436
2437 onError:
2438 Py_XDECREF(v);
2439 Py_XDECREF(errorHandler);
2440 Py_XDECREF(exc);
2441 return NULL;
2442}
2443
Guido van Rossumd57fd912000-03-10 22:53:23 +00002444/* --- Latin-1 Codec ------------------------------------------------------ */
2445
2446PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002447 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002448 const char *errors)
2449{
2450 PyUnicodeObject *v;
2451 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002452
Guido van Rossumd57fd912000-03-10 22:53:23 +00002453 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002454 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002455 Py_UNICODE r = *(unsigned char*)s;
2456 return PyUnicode_FromUnicode(&r, 1);
2457 }
2458
Guido van Rossumd57fd912000-03-10 22:53:23 +00002459 v = _PyUnicode_New(size);
2460 if (v == NULL)
2461 goto onError;
2462 if (size == 0)
2463 return (PyObject *)v;
2464 p = PyUnicode_AS_UNICODE(v);
2465 while (size-- > 0)
2466 *p++ = (unsigned char)*s++;
2467 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002468
Guido van Rossumd57fd912000-03-10 22:53:23 +00002469 onError:
2470 Py_XDECREF(v);
2471 return NULL;
2472}
2473
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002474/* create or adjust a UnicodeEncodeError */
2475static void make_encode_exception(PyObject **exceptionObject,
2476 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002477 const Py_UNICODE *unicode, Py_ssize_t size,
2478 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002479 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002480{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002481 if (*exceptionObject == NULL) {
2482 *exceptionObject = PyUnicodeEncodeError_Create(
2483 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002484 }
2485 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002486 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2487 goto onError;
2488 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2489 goto onError;
2490 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2491 goto onError;
2492 return;
2493 onError:
2494 Py_DECREF(*exceptionObject);
2495 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002496 }
2497}
2498
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002499/* raises a UnicodeEncodeError */
2500static void raise_encode_exception(PyObject **exceptionObject,
2501 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002502 const Py_UNICODE *unicode, Py_ssize_t size,
2503 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002504 const char *reason)
2505{
2506 make_encode_exception(exceptionObject,
2507 encoding, unicode, size, startpos, endpos, reason);
2508 if (*exceptionObject != NULL)
2509 PyCodec_StrictErrors(*exceptionObject);
2510}
2511
2512/* error handling callback helper:
2513 build arguments, call the callback and check the arguments,
2514 put the result into newpos and return the replacement string, which
2515 has to be freed by the caller */
2516static PyObject *unicode_encode_call_errorhandler(const char *errors,
2517 PyObject **errorHandler,
2518 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002519 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2520 Py_ssize_t startpos, Py_ssize_t endpos,
2521 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002522{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002523 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002524
2525 PyObject *restuple;
2526 PyObject *resunicode;
2527
2528 if (*errorHandler == NULL) {
2529 *errorHandler = PyCodec_LookupError(errors);
2530 if (*errorHandler == NULL)
2531 return NULL;
2532 }
2533
2534 make_encode_exception(exceptionObject,
2535 encoding, unicode, size, startpos, endpos, reason);
2536 if (*exceptionObject == NULL)
2537 return NULL;
2538
2539 restuple = PyObject_CallFunctionObjArgs(
2540 *errorHandler, *exceptionObject, NULL);
2541 if (restuple == NULL)
2542 return NULL;
2543 if (!PyTuple_Check(restuple)) {
2544 PyErr_Format(PyExc_TypeError, &argparse[4]);
2545 Py_DECREF(restuple);
2546 return NULL;
2547 }
2548 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2549 &resunicode, newpos)) {
2550 Py_DECREF(restuple);
2551 return NULL;
2552 }
2553 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002554 *newpos = size+*newpos;
2555 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002556 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002557 Py_DECREF(restuple);
2558 return NULL;
2559 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002560 Py_INCREF(resunicode);
2561 Py_DECREF(restuple);
2562 return resunicode;
2563}
2564
2565static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002566 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002567 const char *errors,
2568 int limit)
2569{
2570 /* output object */
2571 PyObject *res;
2572 /* pointers to the beginning and end+1 of input */
2573 const Py_UNICODE *startp = p;
2574 const Py_UNICODE *endp = p + size;
2575 /* pointer to the beginning of the unencodable characters */
2576 /* const Py_UNICODE *badp = NULL; */
2577 /* pointer into the output */
2578 char *str;
2579 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002580 Py_ssize_t respos = 0;
2581 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00002582 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2583 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002584 PyObject *errorHandler = NULL;
2585 PyObject *exc = NULL;
2586 /* the following variable is used for caching string comparisons
2587 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2588 int known_errorHandler = -1;
2589
2590 /* allocate enough for a simple encoding without
2591 replacements, if we need more, we'll resize */
2592 res = PyString_FromStringAndSize(NULL, size);
2593 if (res == NULL)
2594 goto onError;
2595 if (size == 0)
2596 return res;
2597 str = PyString_AS_STRING(res);
2598 ressize = size;
2599
2600 while (p<endp) {
2601 Py_UNICODE c = *p;
2602
2603 /* can we encode this? */
2604 if (c<limit) {
2605 /* no overflow check, because we know that the space is enough */
2606 *str++ = (char)c;
2607 ++p;
2608 }
2609 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002610 Py_ssize_t unicodepos = p-startp;
2611 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002612 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002613 Py_ssize_t repsize;
2614 Py_ssize_t newpos;
2615 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002616 Py_UNICODE *uni2;
2617 /* startpos for collecting unencodable chars */
2618 const Py_UNICODE *collstart = p;
2619 const Py_UNICODE *collend = p;
2620 /* find all unecodable characters */
2621 while ((collend < endp) && ((*collend)>=limit))
2622 ++collend;
2623 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2624 if (known_errorHandler==-1) {
2625 if ((errors==NULL) || (!strcmp(errors, "strict")))
2626 known_errorHandler = 1;
2627 else if (!strcmp(errors, "replace"))
2628 known_errorHandler = 2;
2629 else if (!strcmp(errors, "ignore"))
2630 known_errorHandler = 3;
2631 else if (!strcmp(errors, "xmlcharrefreplace"))
2632 known_errorHandler = 4;
2633 else
2634 known_errorHandler = 0;
2635 }
2636 switch (known_errorHandler) {
2637 case 1: /* strict */
2638 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2639 goto onError;
2640 case 2: /* replace */
2641 while (collstart++<collend)
2642 *str++ = '?'; /* fall through */
2643 case 3: /* ignore */
2644 p = collend;
2645 break;
2646 case 4: /* xmlcharrefreplace */
2647 respos = str-PyString_AS_STRING(res);
2648 /* determine replacement size (temporarily (mis)uses p) */
2649 for (p = collstart, repsize = 0; p < collend; ++p) {
2650 if (*p<10)
2651 repsize += 2+1+1;
2652 else if (*p<100)
2653 repsize += 2+2+1;
2654 else if (*p<1000)
2655 repsize += 2+3+1;
2656 else if (*p<10000)
2657 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002658#ifndef Py_UNICODE_WIDE
2659 else
2660 repsize += 2+5+1;
2661#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002662 else if (*p<100000)
2663 repsize += 2+5+1;
2664 else if (*p<1000000)
2665 repsize += 2+6+1;
2666 else
2667 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002668#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002669 }
2670 requiredsize = respos+repsize+(endp-collend);
2671 if (requiredsize > ressize) {
2672 if (requiredsize<2*ressize)
2673 requiredsize = 2*ressize;
2674 if (_PyString_Resize(&res, requiredsize))
2675 goto onError;
2676 str = PyString_AS_STRING(res) + respos;
2677 ressize = requiredsize;
2678 }
2679 /* generate replacement (temporarily (mis)uses p) */
2680 for (p = collstart; p < collend; ++p) {
2681 str += sprintf(str, "&#%d;", (int)*p);
2682 }
2683 p = collend;
2684 break;
2685 default:
2686 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2687 encoding, reason, startp, size, &exc,
2688 collstart-startp, collend-startp, &newpos);
2689 if (repunicode == NULL)
2690 goto onError;
2691 /* need more space? (at least enough for what we
2692 have+the replacement+the rest of the string, so
2693 we won't have to check space for encodable characters) */
2694 respos = str-PyString_AS_STRING(res);
2695 repsize = PyUnicode_GET_SIZE(repunicode);
2696 requiredsize = respos+repsize+(endp-collend);
2697 if (requiredsize > ressize) {
2698 if (requiredsize<2*ressize)
2699 requiredsize = 2*ressize;
2700 if (_PyString_Resize(&res, requiredsize)) {
2701 Py_DECREF(repunicode);
2702 goto onError;
2703 }
2704 str = PyString_AS_STRING(res) + respos;
2705 ressize = requiredsize;
2706 }
2707 /* check if there is anything unencodable in the replacement
2708 and copy it to the output */
2709 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2710 c = *uni2;
2711 if (c >= limit) {
2712 raise_encode_exception(&exc, encoding, startp, size,
2713 unicodepos, unicodepos+1, reason);
2714 Py_DECREF(repunicode);
2715 goto onError;
2716 }
2717 *str = (char)c;
2718 }
2719 p = startp + newpos;
2720 Py_DECREF(repunicode);
2721 }
2722 }
2723 }
2724 /* Resize if we allocated to much */
2725 respos = str-PyString_AS_STRING(res);
2726 if (respos<ressize)
2727 /* If this falls res will be NULL */
2728 _PyString_Resize(&res, respos);
2729 Py_XDECREF(errorHandler);
2730 Py_XDECREF(exc);
2731 return res;
2732
2733 onError:
2734 Py_XDECREF(res);
2735 Py_XDECREF(errorHandler);
2736 Py_XDECREF(exc);
2737 return NULL;
2738}
2739
Guido van Rossumd57fd912000-03-10 22:53:23 +00002740PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002741 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742 const char *errors)
2743{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002744 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002745}
2746
2747PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2748{
2749 if (!PyUnicode_Check(unicode)) {
2750 PyErr_BadArgument();
2751 return NULL;
2752 }
2753 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2754 PyUnicode_GET_SIZE(unicode),
2755 NULL);
2756}
2757
2758/* --- 7-bit ASCII Codec -------------------------------------------------- */
2759
Guido van Rossumd57fd912000-03-10 22:53:23 +00002760PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002761 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762 const char *errors)
2763{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002764 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002765 PyUnicodeObject *v;
2766 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002767 Py_ssize_t startinpos;
2768 Py_ssize_t endinpos;
2769 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002770 const char *e;
2771 PyObject *errorHandler = NULL;
2772 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002773
Guido van Rossumd57fd912000-03-10 22:53:23 +00002774 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002775 if (size == 1 && *(unsigned char*)s < 128) {
2776 Py_UNICODE r = *(unsigned char*)s;
2777 return PyUnicode_FromUnicode(&r, 1);
2778 }
Tim Petersced69f82003-09-16 20:30:58 +00002779
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780 v = _PyUnicode_New(size);
2781 if (v == NULL)
2782 goto onError;
2783 if (size == 0)
2784 return (PyObject *)v;
2785 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002786 e = s + size;
2787 while (s < e) {
2788 register unsigned char c = (unsigned char)*s;
2789 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002790 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002791 ++s;
2792 }
2793 else {
2794 startinpos = s-starts;
2795 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002796 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002797 if (unicode_decode_call_errorhandler(
2798 errors, &errorHandler,
2799 "ascii", "ordinal not in range(128)",
2800 starts, size, &startinpos, &endinpos, &exc, &s,
2801 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002803 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002804 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002805 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00002806 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002807 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002808 Py_XDECREF(errorHandler);
2809 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002810 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002811
Guido van Rossumd57fd912000-03-10 22:53:23 +00002812 onError:
2813 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002814 Py_XDECREF(errorHandler);
2815 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002816 return NULL;
2817}
2818
Guido van Rossumd57fd912000-03-10 22:53:23 +00002819PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002820 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 const char *errors)
2822{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002823 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824}
2825
2826PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2827{
2828 if (!PyUnicode_Check(unicode)) {
2829 PyErr_BadArgument();
2830 return NULL;
2831 }
2832 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2833 PyUnicode_GET_SIZE(unicode),
2834 NULL);
2835}
2836
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002837#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002838
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002839/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002840
Martin v. Löwisd8251432006-06-14 05:21:04 +00002841#if SIZEOF_INT < SIZEOF_SSIZE_T
2842#define NEED_RETRY
2843#endif
2844
2845/* XXX This code is limited to "true" double-byte encodings, as
2846 a) it assumes an incomplete character consists of a single byte, and
2847 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2848 encodings, see IsDBCSLeadByteEx documentation. */
2849
/* Return non-zero if the byte at s[offset] should be treated as a DBCS
   lead byte.

   The byte must satisfy IsDBCSLeadByte(); in addition, looking at the
   character CharPrev() finds before it, one of these must hold: there
   is no previous character, the previous character does not start with
   a lead byte, or the previous character is two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
2860
2861/*
2862 * Decode MBCS string into unicode object. If 'final' is set, converts
2863 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
2864 */
2865static int decode_mbcs(PyUnicodeObject **v,
2866 const char *s, /* MBCS string */
2867 int size, /* sizeof MBCS string */
2868 int final)
2869{
2870 Py_UNICODE *p;
2871 Py_ssize_t n = 0;
2872 int usize = 0;
2873
2874 assert(size >= 0);
2875
2876 /* Skip trailing lead-byte unless 'final' is set */
2877 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
2878 --size;
2879
2880 /* First get the size of the result */
2881 if (size > 0) {
2882 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2883 if (usize == 0) {
2884 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2885 return -1;
2886 }
2887 }
2888
2889 if (*v == NULL) {
2890 /* Create unicode object */
2891 *v = _PyUnicode_New(usize);
2892 if (*v == NULL)
2893 return -1;
2894 }
2895 else {
2896 /* Extend unicode object */
2897 n = PyUnicode_GET_SIZE(*v);
2898 if (_PyUnicode_Resize(v, n + usize) < 0)
2899 return -1;
2900 }
2901
2902 /* Do the conversion */
2903 if (size > 0) {
2904 p = PyUnicode_AS_UNICODE(*v) + n;
2905 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2906 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2907 return -1;
2908 }
2909 }
2910
2911 return size;
2912}
2913
2914PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
2915 Py_ssize_t size,
2916 const char *errors,
2917 Py_ssize_t *consumed)
2918{
2919 PyUnicodeObject *v = NULL;
2920 int done;
2921
2922 if (consumed)
2923 *consumed = 0;
2924
2925#ifdef NEED_RETRY
2926 retry:
2927 if (size > INT_MAX)
2928 done = decode_mbcs(&v, s, INT_MAX, 0);
2929 else
2930#endif
2931 done = decode_mbcs(&v, s, (int)size, !consumed);
2932
2933 if (done < 0) {
2934 Py_XDECREF(v);
2935 return NULL;
2936 }
2937
2938 if (consumed)
2939 *consumed += done;
2940
2941#ifdef NEED_RETRY
2942 if (size > INT_MAX) {
2943 s += done;
2944 size -= done;
2945 goto retry;
2946 }
2947#endif
2948
2949 return (PyObject *)v;
2950}
2951
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002952PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002953 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002954 const char *errors)
2955{
Martin v. Löwisd8251432006-06-14 05:21:04 +00002956 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
2957}
2958
2959/*
2960 * Convert unicode into string object (MBCS).
2961 * Returns 0 if succeed, -1 otherwise.
2962 */
2963static int encode_mbcs(PyObject **repr,
2964 const Py_UNICODE *p, /* unicode */
2965 int size) /* size of unicode */
2966{
2967 int mbcssize = 0;
2968 Py_ssize_t n = 0;
2969
2970 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002971
2972 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00002973 if (size > 0) {
2974 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2975 if (mbcssize == 0) {
2976 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2977 return -1;
2978 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002979 }
2980
Martin v. Löwisd8251432006-06-14 05:21:04 +00002981 if (*repr == NULL) {
2982 /* Create string object */
2983 *repr = PyString_FromStringAndSize(NULL, mbcssize);
2984 if (*repr == NULL)
2985 return -1;
2986 }
2987 else {
2988 /* Extend string object */
2989 n = PyString_Size(*repr);
2990 if (_PyString_Resize(repr, n + mbcssize) < 0)
2991 return -1;
2992 }
2993
2994 /* Do the conversion */
2995 if (size > 0) {
2996 char *s = PyString_AS_STRING(*repr) + n;
2997 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2998 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2999 return -1;
3000 }
3001 }
3002
3003 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003004}
3005
3006PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003007 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003008 const char *errors)
3009{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003010 PyObject *repr = NULL;
3011 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003012
Martin v. Löwisd8251432006-06-14 05:21:04 +00003013#ifdef NEED_RETRY
3014 retry:
3015 if (size > INT_MAX)
3016 ret = encode_mbcs(&repr, p, INT_MAX);
3017 else
3018#endif
3019 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003020
Martin v. Löwisd8251432006-06-14 05:21:04 +00003021 if (ret < 0) {
3022 Py_XDECREF(repr);
3023 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003024 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003025
3026#ifdef NEED_RETRY
3027 if (size > INT_MAX) {
3028 p += INT_MAX;
3029 size -= INT_MAX;
3030 goto retry;
3031 }
3032#endif
3033
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003034 return repr;
3035}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003036
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003037PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3038{
3039 if (!PyUnicode_Check(unicode)) {
3040 PyErr_BadArgument();
3041 return NULL;
3042 }
3043 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3044 PyUnicode_GET_SIZE(unicode),
3045 NULL);
3046}
3047
Martin v. Löwisd8251432006-06-14 05:21:04 +00003048#undef NEED_RETRY
3049
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003050#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003051
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052/* --- Character Mapping Codec -------------------------------------------- */
3053
Guido van Rossumd57fd912000-03-10 22:53:23 +00003054PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003055 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056 PyObject *mapping,
3057 const char *errors)
3058{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003059 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003060 Py_ssize_t startinpos;
3061 Py_ssize_t endinpos;
3062 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003063 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003064 PyUnicodeObject *v;
3065 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003066 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003067 PyObject *errorHandler = NULL;
3068 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003069 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003070 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003071
Guido van Rossumd57fd912000-03-10 22:53:23 +00003072 /* Default to Latin-1 */
3073 if (mapping == NULL)
3074 return PyUnicode_DecodeLatin1(s, size, errors);
3075
3076 v = _PyUnicode_New(size);
3077 if (v == NULL)
3078 goto onError;
3079 if (size == 0)
3080 return (PyObject *)v;
3081 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003082 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003083 if (PyUnicode_CheckExact(mapping)) {
3084 mapstring = PyUnicode_AS_UNICODE(mapping);
3085 maplen = PyUnicode_GET_SIZE(mapping);
3086 while (s < e) {
3087 unsigned char ch = *s;
3088 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003089
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003090 if (ch < maplen)
3091 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003092
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003093 if (x == 0xfffe) {
3094 /* undefined mapping */
3095 outpos = p-PyUnicode_AS_UNICODE(v);
3096 startinpos = s-starts;
3097 endinpos = startinpos+1;
3098 if (unicode_decode_call_errorhandler(
3099 errors, &errorHandler,
3100 "charmap", "character maps to <undefined>",
3101 starts, size, &startinpos, &endinpos, &exc, &s,
3102 (PyObject **)&v, &outpos, &p)) {
3103 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003104 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003105 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003106 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003107 *p++ = x;
3108 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003109 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003110 }
3111 else {
3112 while (s < e) {
3113 unsigned char ch = *s;
3114 PyObject *w, *x;
3115
3116 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3117 w = PyInt_FromLong((long)ch);
3118 if (w == NULL)
3119 goto onError;
3120 x = PyObject_GetItem(mapping, w);
3121 Py_DECREF(w);
3122 if (x == NULL) {
3123 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3124 /* No mapping found means: mapping is undefined. */
3125 PyErr_Clear();
3126 x = Py_None;
3127 Py_INCREF(x);
3128 } else
3129 goto onError;
3130 }
3131
3132 /* Apply mapping */
3133 if (PyInt_Check(x)) {
3134 long value = PyInt_AS_LONG(x);
3135 if (value < 0 || value > 65535) {
3136 PyErr_SetString(PyExc_TypeError,
3137 "character mapping must be in range(65536)");
3138 Py_DECREF(x);
3139 goto onError;
3140 }
3141 *p++ = (Py_UNICODE)value;
3142 }
3143 else if (x == Py_None) {
3144 /* undefined mapping */
3145 outpos = p-PyUnicode_AS_UNICODE(v);
3146 startinpos = s-starts;
3147 endinpos = startinpos+1;
3148 if (unicode_decode_call_errorhandler(
3149 errors, &errorHandler,
3150 "charmap", "character maps to <undefined>",
3151 starts, size, &startinpos, &endinpos, &exc, &s,
3152 (PyObject **)&v, &outpos, &p)) {
3153 Py_DECREF(x);
3154 goto onError;
3155 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003156 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003157 continue;
3158 }
3159 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003160 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003161
3162 if (targetsize == 1)
3163 /* 1-1 mapping */
3164 *p++ = *PyUnicode_AS_UNICODE(x);
3165
3166 else if (targetsize > 1) {
3167 /* 1-n mapping */
3168 if (targetsize > extrachars) {
3169 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003170 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3171 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003172 (targetsize << 2);
3173 extrachars += needed;
Armin Rigo7ccbca92006-10-04 12:17:45 +00003174 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003175 if (_PyUnicode_Resize(&v,
3176 PyUnicode_GET_SIZE(v) + needed) < 0) {
3177 Py_DECREF(x);
3178 goto onError;
3179 }
3180 p = PyUnicode_AS_UNICODE(v) + oldpos;
3181 }
3182 Py_UNICODE_COPY(p,
3183 PyUnicode_AS_UNICODE(x),
3184 targetsize);
3185 p += targetsize;
3186 extrachars -= targetsize;
3187 }
3188 /* 1-0 mapping: skip the character */
3189 }
3190 else {
3191 /* wrong return value */
3192 PyErr_SetString(PyExc_TypeError,
3193 "character mapping must return integer, None or unicode");
3194 Py_DECREF(x);
3195 goto onError;
3196 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003197 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003198 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003199 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003200 }
3201 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003202 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003203 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003204 Py_XDECREF(errorHandler);
3205 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003207
Guido van Rossumd57fd912000-03-10 22:53:23 +00003208 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003209 Py_XDECREF(errorHandler);
3210 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003211 Py_XDECREF(v);
3212 return NULL;
3213}
3214
Martin v. Löwis3f767792006-06-04 19:36:28 +00003215/* Charmap encoding: the lookup table */
3216
3217struct encoding_map{
3218 PyObject_HEAD
3219 unsigned char level1[32];
3220 int count2, count3;
3221 unsigned char level23[1];
3222};
3223
3224static PyObject*
3225encoding_map_size(PyObject *obj, PyObject* args)
3226{
3227 struct encoding_map *map = (struct encoding_map*)obj;
3228 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3229 128*map->count3);
3230}
3231
3232static PyMethodDef encoding_map_methods[] = {
3233 {"size", encoding_map_size, METH_NOARGS,
3234 PyDoc_STR("Return the size (in bytes) of this object") },
3235 { 0 }
3236};
3237
3238static void
3239encoding_map_dealloc(PyObject* o)
3240{
3241 PyObject_FREE(o);
3242}
3243
3244static PyTypeObject EncodingMapType = {
3245 PyObject_HEAD_INIT(NULL)
3246 0, /*ob_size*/
3247 "EncodingMap", /*tp_name*/
3248 sizeof(struct encoding_map), /*tp_basicsize*/
3249 0, /*tp_itemsize*/
3250 /* methods */
3251 encoding_map_dealloc, /*tp_dealloc*/
3252 0, /*tp_print*/
3253 0, /*tp_getattr*/
3254 0, /*tp_setattr*/
3255 0, /*tp_compare*/
3256 0, /*tp_repr*/
3257 0, /*tp_as_number*/
3258 0, /*tp_as_sequence*/
3259 0, /*tp_as_mapping*/
3260 0, /*tp_hash*/
3261 0, /*tp_call*/
3262 0, /*tp_str*/
3263 0, /*tp_getattro*/
3264 0, /*tp_setattro*/
3265 0, /*tp_as_buffer*/
3266 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3267 0, /*tp_doc*/
3268 0, /*tp_traverse*/
3269 0, /*tp_clear*/
3270 0, /*tp_richcompare*/
3271 0, /*tp_weaklistoffset*/
3272 0, /*tp_iter*/
3273 0, /*tp_iternext*/
3274 encoding_map_methods, /*tp_methods*/
3275 0, /*tp_members*/
3276 0, /*tp_getset*/
3277 0, /*tp_base*/
3278 0, /*tp_dict*/
3279 0, /*tp_descr_get*/
3280 0, /*tp_descr_set*/
3281 0, /*tp_dictoffset*/
3282 0, /*tp_init*/
3283 0, /*tp_alloc*/
3284 0, /*tp_new*/
3285 0, /*tp_free*/
3286 0, /*tp_is_gc*/
3287};
3288
3289PyObject*
3290PyUnicode_BuildEncodingMap(PyObject* string)
3291{
3292 Py_UNICODE *decode;
3293 PyObject *result;
3294 struct encoding_map *mresult;
3295 int i;
3296 int need_dict = 0;
3297 unsigned char level1[32];
3298 unsigned char level2[512];
3299 unsigned char *mlevel1, *mlevel2, *mlevel3;
3300 int count2 = 0, count3 = 0;
3301
3302 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3303 PyErr_BadArgument();
3304 return NULL;
3305 }
3306 decode = PyUnicode_AS_UNICODE(string);
3307 memset(level1, 0xFF, sizeof level1);
3308 memset(level2, 0xFF, sizeof level2);
3309
3310 /* If there isn't a one-to-one mapping of NULL to \0,
3311 or if there are non-BMP characters, we need to use
3312 a mapping dictionary. */
3313 if (decode[0] != 0)
3314 need_dict = 1;
3315 for (i = 1; i < 256; i++) {
3316 int l1, l2;
3317 if (decode[i] == 0
3318 #ifdef Py_UNICODE_WIDE
3319 || decode[i] > 0xFFFF
3320 #endif
3321 ) {
3322 need_dict = 1;
3323 break;
3324 }
3325 if (decode[i] == 0xFFFE)
3326 /* unmapped character */
3327 continue;
3328 l1 = decode[i] >> 11;
3329 l2 = decode[i] >> 7;
3330 if (level1[l1] == 0xFF)
3331 level1[l1] = count2++;
3332 if (level2[l2] == 0xFF)
3333 level2[l2] = count3++;
3334 }
3335
3336 if (count2 >= 0xFF || count3 >= 0xFF)
3337 need_dict = 1;
3338
3339 if (need_dict) {
3340 PyObject *result = PyDict_New();
3341 PyObject *key, *value;
3342 if (!result)
3343 return NULL;
3344 for (i = 0; i < 256; i++) {
3345 key = value = NULL;
3346 key = PyInt_FromLong(decode[i]);
3347 value = PyInt_FromLong(i);
3348 if (!key || !value)
3349 goto failed1;
3350 if (PyDict_SetItem(result, key, value) == -1)
3351 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00003352 Py_DECREF(key);
3353 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003354 }
3355 return result;
3356 failed1:
3357 Py_XDECREF(key);
3358 Py_XDECREF(value);
3359 Py_DECREF(result);
3360 return NULL;
3361 }
3362
3363 /* Create a three-level trie */
3364 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3365 16*count2 + 128*count3 - 1);
3366 if (!result)
3367 return PyErr_NoMemory();
3368 PyObject_Init(result, &EncodingMapType);
3369 mresult = (struct encoding_map*)result;
3370 mresult->count2 = count2;
3371 mresult->count3 = count3;
3372 mlevel1 = mresult->level1;
3373 mlevel2 = mresult->level23;
3374 mlevel3 = mresult->level23 + 16*count2;
3375 memcpy(mlevel1, level1, 32);
3376 memset(mlevel2, 0xFF, 16*count2);
3377 memset(mlevel3, 0, 128*count3);
3378 count3 = 0;
3379 for (i = 1; i < 256; i++) {
3380 int o1, o2, o3, i2, i3;
3381 if (decode[i] == 0xFFFE)
3382 /* unmapped character */
3383 continue;
3384 o1 = decode[i]>>11;
3385 o2 = (decode[i]>>7) & 0xF;
3386 i2 = 16*mlevel1[o1] + o2;
3387 if (mlevel2[i2] == 0xFF)
3388 mlevel2[i2] = count3++;
3389 o3 = decode[i] & 0x7F;
3390 i3 = 128*mlevel2[i2] + o3;
3391 mlevel3[i3] = i;
3392 }
3393 return result;
3394}
3395
3396static int
3397encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3398{
3399 struct encoding_map *map = (struct encoding_map*)mapping;
3400 int l1 = c>>11;
3401 int l2 = (c>>7) & 0xF;
3402 int l3 = c & 0x7F;
3403 int i;
3404
3405#ifdef Py_UNICODE_WIDE
3406 if (c > 0xFFFF) {
3407 return -1;
3408 }
3409#endif
3410 if (c == 0)
3411 return 0;
3412 /* level 1*/
3413 i = map->level1[l1];
3414 if (i == 0xFF) {
3415 return -1;
3416 }
3417 /* level 2*/
3418 i = map->level23[16*i+l2];
3419 if (i == 0xFF) {
3420 return -1;
3421 }
3422 /* level 3 */
3423 i = map->level23[16*map->count2 + 128*i + l3];
3424 if (i == 0) {
3425 return -1;
3426 }
3427 return i;
3428}
3429
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003430/* Lookup the character ch in the mapping. If the character
3431 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003432 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003433static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003434{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003435 PyObject *w = PyInt_FromLong((long)c);
3436 PyObject *x;
3437
3438 if (w == NULL)
3439 return NULL;
3440 x = PyObject_GetItem(mapping, w);
3441 Py_DECREF(w);
3442 if (x == NULL) {
3443 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3444 /* No mapping found means: mapping is undefined. */
3445 PyErr_Clear();
3446 x = Py_None;
3447 Py_INCREF(x);
3448 return x;
3449 } else
3450 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003451 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003452 else if (x == Py_None)
3453 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003454 else if (PyInt_Check(x)) {
3455 long value = PyInt_AS_LONG(x);
3456 if (value < 0 || value > 255) {
3457 PyErr_SetString(PyExc_TypeError,
3458 "character mapping must be in range(256)");
3459 Py_DECREF(x);
3460 return NULL;
3461 }
3462 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003463 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003464 else if (PyString_Check(x))
3465 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003466 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003467 /* wrong return value */
3468 PyErr_SetString(PyExc_TypeError,
3469 "character mapping must return integer, None or str");
3470 Py_DECREF(x);
3471 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003472 }
3473}
3474
Martin v. Löwis3f767792006-06-04 19:36:28 +00003475static int
3476charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3477{
3478 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3479 /* exponentially overallocate to minimize reallocations */
3480 if (requiredsize < 2*outsize)
3481 requiredsize = 2*outsize;
3482 if (_PyString_Resize(outobj, requiredsize)) {
3483 return 0;
3484 }
3485 return 1;
3486}
3487
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was encoded and written */
    enc_FAILED,         /* the mapping is undefined for the character */
    enc_EXCEPTION       /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003491/* lookup the character, put the result in the output string and adjust
3492 various state variables. Reallocate the output string if not enough
3493 space is available. Return a new reference to the object that
3494 was put in the output buffer, or Py_None, if the mapping was undefined
3495 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003496 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003497static
Martin v. Löwis3f767792006-06-04 19:36:28 +00003498charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003499 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003500{
Martin v. Löwis3f767792006-06-04 19:36:28 +00003501 PyObject *rep;
3502 char *outstart;
3503 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003504
Martin v. Löwis3f767792006-06-04 19:36:28 +00003505 if (mapping->ob_type == &EncodingMapType) {
3506 int res = encoding_map_lookup(c, mapping);
3507 Py_ssize_t requiredsize = *outpos+1;
3508 if (res == -1)
3509 return enc_FAILED;
3510 if (outsize<requiredsize)
3511 if (!charmapencode_resize(outobj, outpos, requiredsize))
3512 return enc_EXCEPTION;
3513 outstart = PyString_AS_STRING(*outobj);
3514 outstart[(*outpos)++] = (char)res;
3515 return enc_SUCCESS;
3516 }
3517
3518 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003519 if (rep==NULL)
Martin v. Löwis3f767792006-06-04 19:36:28 +00003520 return enc_EXCEPTION;
3521 else if (rep==Py_None) {
3522 Py_DECREF(rep);
3523 return enc_FAILED;
3524 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003525 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003526 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003527 if (outsize<requiredsize)
3528 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003529 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003530 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003531 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003532 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003533 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3534 }
3535 else {
3536 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003537 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3538 Py_ssize_t requiredsize = *outpos+repsize;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003539 if (outsize<requiredsize)
3540 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003541 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003542 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003543 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003544 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003545 memcpy(outstart + *outpos, repchars, repsize);
3546 *outpos += repsize;
3547 }
3548 }
Georg Brandl9f167602006-06-04 21:46:16 +00003549 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003550 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003551}
3552
3553/* handle an error in PyUnicode_EncodeCharmap
3554 Return 0 on success, -1 on error */
3555static
3556int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003557 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003558 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003559 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003560 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003561{
3562 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003563 Py_ssize_t repsize;
3564 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003565 Py_UNICODE *uni2;
3566 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003567 Py_ssize_t collstartpos = *inpos;
3568 Py_ssize_t collendpos = *inpos+1;
3569 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003570 char *encoding = "charmap";
3571 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00003572 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003573
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003574 /* find all unencodable characters */
3575 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00003576 PyObject *rep;
3577 if (mapping->ob_type == &EncodingMapType) {
3578 int res = encoding_map_lookup(p[collendpos], mapping);
3579 if (res != -1)
3580 break;
3581 ++collendpos;
3582 continue;
3583 }
3584
3585 rep = charmapencode_lookup(p[collendpos], mapping);
3586 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003587 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003588 else if (rep!=Py_None) {
3589 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003590 break;
3591 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003592 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003593 ++collendpos;
3594 }
3595 /* cache callback name lookup
3596 * (if not done yet, i.e. it's the first error) */
3597 if (*known_errorHandler==-1) {
3598 if ((errors==NULL) || (!strcmp(errors, "strict")))
3599 *known_errorHandler = 1;
3600 else if (!strcmp(errors, "replace"))
3601 *known_errorHandler = 2;
3602 else if (!strcmp(errors, "ignore"))
3603 *known_errorHandler = 3;
3604 else if (!strcmp(errors, "xmlcharrefreplace"))
3605 *known_errorHandler = 4;
3606 else
3607 *known_errorHandler = 0;
3608 }
3609 switch (*known_errorHandler) {
3610 case 1: /* strict */
3611 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3612 return -1;
3613 case 2: /* replace */
3614 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3615 x = charmapencode_output('?', mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003616 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003617 return -1;
3618 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003619 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003620 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3621 return -1;
3622 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003623 }
3624 /* fall through */
3625 case 3: /* ignore */
3626 *inpos = collendpos;
3627 break;
3628 case 4: /* xmlcharrefreplace */
3629 /* generate replacement (temporarily (mis)uses p) */
3630 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3631 char buffer[2+29+1+1];
3632 char *cp;
3633 sprintf(buffer, "&#%d;", (int)p[collpos]);
3634 for (cp = buffer; *cp; ++cp) {
3635 x = charmapencode_output(*cp, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003636 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003637 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003638 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003639 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3640 return -1;
3641 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003642 }
3643 }
3644 *inpos = collendpos;
3645 break;
3646 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003647 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003648 encoding, reason, p, size, exceptionObject,
3649 collstartpos, collendpos, &newpos);
3650 if (repunicode == NULL)
3651 return -1;
3652 /* generate replacement */
3653 repsize = PyUnicode_GET_SIZE(repunicode);
3654 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3655 x = charmapencode_output(*uni2, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003656 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003657 return -1;
3658 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003659 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003660 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003661 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3662 return -1;
3663 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003664 }
3665 *inpos = newpos;
3666 Py_DECREF(repunicode);
3667 }
3668 return 0;
3669}
3670
Guido van Rossumd57fd912000-03-10 22:53:23 +00003671PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003672 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003673 PyObject *mapping,
3674 const char *errors)
3675{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003676 /* output object */
3677 PyObject *res = NULL;
3678 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003679 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003680 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003681 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003682 PyObject *errorHandler = NULL;
3683 PyObject *exc = NULL;
3684 /* the following variable is used for caching string comparisons
3685 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3686 * 3=ignore, 4=xmlcharrefreplace */
3687 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003688
3689 /* Default to Latin-1 */
3690 if (mapping == NULL)
3691 return PyUnicode_EncodeLatin1(p, size, errors);
3692
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003693 /* allocate enough for a simple encoding without
3694 replacements, if we need more, we'll resize */
3695 res = PyString_FromStringAndSize(NULL, size);
3696 if (res == NULL)
3697 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003698 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003699 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003700
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003701 while (inpos<size) {
3702 /* try to encode it */
Martin v. Löwis3f767792006-06-04 19:36:28 +00003703 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3704 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003705 goto onError;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003706 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003707 if (charmap_encoding_error(p, size, &inpos, mapping,
3708 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003709 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003710 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003711 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003712 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003713 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003714 else
3715 /* done with this character => adjust input position */
3716 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003717 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003718
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003719 /* Resize if we allocated to much */
3720 if (respos<PyString_GET_SIZE(res)) {
3721 if (_PyString_Resize(&res, respos))
3722 goto onError;
3723 }
3724 Py_XDECREF(exc);
3725 Py_XDECREF(errorHandler);
3726 return res;
3727
3728 onError:
3729 Py_XDECREF(res);
3730 Py_XDECREF(exc);
3731 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003732 return NULL;
3733}
3734
3735PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3736 PyObject *mapping)
3737{
3738 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3739 PyErr_BadArgument();
3740 return NULL;
3741 }
3742 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3743 PyUnicode_GET_SIZE(unicode),
3744 mapping,
3745 NULL);
3746}
3747
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003748/* create or adjust a UnicodeTranslateError */
3749static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003750 const Py_UNICODE *unicode, Py_ssize_t size,
3751 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003752 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003753{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003754 if (*exceptionObject == NULL) {
3755 *exceptionObject = PyUnicodeTranslateError_Create(
3756 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003757 }
3758 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003759 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3760 goto onError;
3761 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3762 goto onError;
3763 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3764 goto onError;
3765 return;
3766 onError:
3767 Py_DECREF(*exceptionObject);
3768 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003769 }
3770}
3771
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003772/* raises a UnicodeTranslateError */
3773static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003774 const Py_UNICODE *unicode, Py_ssize_t size,
3775 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003776 const char *reason)
3777{
3778 make_translate_exception(exceptionObject,
3779 unicode, size, startpos, endpos, reason);
3780 if (*exceptionObject != NULL)
3781 PyCodec_StrictErrors(*exceptionObject);
3782}
3783
3784/* error handling callback helper:
3785 build arguments, call the callback and check the arguments,
3786 put the result into newpos and return the replacement string, which
3787 has to be freed by the caller */
3788static PyObject *unicode_translate_call_errorhandler(const char *errors,
3789 PyObject **errorHandler,
3790 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003791 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3792 Py_ssize_t startpos, Py_ssize_t endpos,
3793 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003794{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003795 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003796
Martin v. Löwis412fb672006-04-13 06:34:32 +00003797 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003798 PyObject *restuple;
3799 PyObject *resunicode;
3800
3801 if (*errorHandler == NULL) {
3802 *errorHandler = PyCodec_LookupError(errors);
3803 if (*errorHandler == NULL)
3804 return NULL;
3805 }
3806
3807 make_translate_exception(exceptionObject,
3808 unicode, size, startpos, endpos, reason);
3809 if (*exceptionObject == NULL)
3810 return NULL;
3811
3812 restuple = PyObject_CallFunctionObjArgs(
3813 *errorHandler, *exceptionObject, NULL);
3814 if (restuple == NULL)
3815 return NULL;
3816 if (!PyTuple_Check(restuple)) {
3817 PyErr_Format(PyExc_TypeError, &argparse[4]);
3818 Py_DECREF(restuple);
3819 return NULL;
3820 }
3821 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003822 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003823 Py_DECREF(restuple);
3824 return NULL;
3825 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003826 if (i_newpos<0)
3827 *newpos = size+i_newpos;
3828 else
3829 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003830 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003831 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003832 Py_DECREF(restuple);
3833 return NULL;
3834 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003835 Py_INCREF(resunicode);
3836 Py_DECREF(restuple);
3837 return resunicode;
3838}
3839
3840/* Lookup the character ch in the mapping and put the result in result,
3841 which must be decrefed by the caller.
3842 Return 0 on success, -1 on error */
3843static
3844int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3845{
3846 PyObject *w = PyInt_FromLong((long)c);
3847 PyObject *x;
3848
3849 if (w == NULL)
3850 return -1;
3851 x = PyObject_GetItem(mapping, w);
3852 Py_DECREF(w);
3853 if (x == NULL) {
3854 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3855 /* No mapping found means: use 1:1 mapping. */
3856 PyErr_Clear();
3857 *result = NULL;
3858 return 0;
3859 } else
3860 return -1;
3861 }
3862 else if (x == Py_None) {
3863 *result = x;
3864 return 0;
3865 }
3866 else if (PyInt_Check(x)) {
3867 long value = PyInt_AS_LONG(x);
3868 long max = PyUnicode_GetMax();
3869 if (value < 0 || value > max) {
3870 PyErr_Format(PyExc_TypeError,
3871 "character mapping must be in range(0x%lx)", max+1);
3872 Py_DECREF(x);
3873 return -1;
3874 }
3875 *result = x;
3876 return 0;
3877 }
3878 else if (PyUnicode_Check(x)) {
3879 *result = x;
3880 return 0;
3881 }
3882 else {
3883 /* wrong return value */
3884 PyErr_SetString(PyExc_TypeError,
3885 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003886 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003887 return -1;
3888 }
3889}
3890/* ensure that *outobj is at least requiredsize characters long,
3891if not reallocate and adjust various state variables.
3892Return 0 on success, -1 on error */
3893static
Walter Dörwald4894c302003-10-24 14:25:28 +00003894int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003895 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003896{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003897 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003898 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003899 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003900 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003901 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003902 if (requiredsize < 2 * oldsize)
3903 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003904 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003905 return -1;
3906 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003907 }
3908 return 0;
3909}
3910/* lookup the character, put the result in the output string and adjust
3911 various state variables. Return a new reference to the object that
3912 was put in the output buffer in *result, or Py_None, if the mapping was
3913 undefined (in which case no character was written).
3914 The called must decref result.
3915 Return 0 on success, -1 on error. */
3916static
Walter Dörwald4894c302003-10-24 14:25:28 +00003917int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003918 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003919 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003920{
Walter Dörwald4894c302003-10-24 14:25:28 +00003921 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003922 return -1;
3923 if (*res==NULL) {
3924 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003925 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003926 }
3927 else if (*res==Py_None)
3928 ;
3929 else if (PyInt_Check(*res)) {
3930 /* no overflow check, because we know that the space is enough */
3931 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3932 }
3933 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003934 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003935 if (repsize==1) {
3936 /* no overflow check, because we know that the space is enough */
3937 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3938 }
3939 else if (repsize!=0) {
3940 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003941 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003942 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003943 repsize - 1;
3944 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003945 return -1;
3946 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3947 *outp += repsize;
3948 }
3949 }
3950 else
3951 return -1;
3952 return 0;
3953}
3954
3955PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003956 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003957 PyObject *mapping,
3958 const char *errors)
3959{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003960 /* output object */
3961 PyObject *res = NULL;
3962 /* pointers to the beginning and end+1 of input */
3963 const Py_UNICODE *startp = p;
3964 const Py_UNICODE *endp = p + size;
3965 /* pointer into the output */
3966 Py_UNICODE *str;
3967 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003968 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003969 char *reason = "character maps to <undefined>";
3970 PyObject *errorHandler = NULL;
3971 PyObject *exc = NULL;
3972 /* the following variable is used for caching string comparisons
3973 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3974 * 3=ignore, 4=xmlcharrefreplace */
3975 int known_errorHandler = -1;
3976
Guido van Rossumd57fd912000-03-10 22:53:23 +00003977 if (mapping == NULL) {
3978 PyErr_BadArgument();
3979 return NULL;
3980 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003981
3982 /* allocate enough for a simple 1:1 translation without
3983 replacements, if we need more, we'll resize */
3984 res = PyUnicode_FromUnicode(NULL, size);
3985 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003986 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003987 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003988 return res;
3989 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003990
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003991 while (p<endp) {
3992 /* try to encode it */
3993 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003994 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003995 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003996 goto onError;
3997 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003998 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003999 if (x!=Py_None) /* it worked => adjust input pointer */
4000 ++p;
4001 else { /* untranslatable character */
4002 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004003 Py_ssize_t repsize;
4004 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004005 Py_UNICODE *uni2;
4006 /* startpos for collecting untranslatable chars */
4007 const Py_UNICODE *collstart = p;
4008 const Py_UNICODE *collend = p+1;
4009 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004010
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004011 /* find all untranslatable characters */
4012 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004013 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004014 goto onError;
4015 Py_XDECREF(x);
4016 if (x!=Py_None)
4017 break;
4018 ++collend;
4019 }
4020 /* cache callback name lookup
4021 * (if not done yet, i.e. it's the first error) */
4022 if (known_errorHandler==-1) {
4023 if ((errors==NULL) || (!strcmp(errors, "strict")))
4024 known_errorHandler = 1;
4025 else if (!strcmp(errors, "replace"))
4026 known_errorHandler = 2;
4027 else if (!strcmp(errors, "ignore"))
4028 known_errorHandler = 3;
4029 else if (!strcmp(errors, "xmlcharrefreplace"))
4030 known_errorHandler = 4;
4031 else
4032 known_errorHandler = 0;
4033 }
4034 switch (known_errorHandler) {
4035 case 1: /* strict */
4036 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4037 goto onError;
4038 case 2: /* replace */
4039 /* No need to check for space, this is a 1:1 replacement */
4040 for (coll = collstart; coll<collend; ++coll)
4041 *str++ = '?';
4042 /* fall through */
4043 case 3: /* ignore */
4044 p = collend;
4045 break;
4046 case 4: /* xmlcharrefreplace */
4047 /* generate replacement (temporarily (mis)uses p) */
4048 for (p = collstart; p < collend; ++p) {
4049 char buffer[2+29+1+1];
4050 char *cp;
4051 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004052 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004053 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4054 goto onError;
4055 for (cp = buffer; *cp; ++cp)
4056 *str++ = *cp;
4057 }
4058 p = collend;
4059 break;
4060 default:
4061 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4062 reason, startp, size, &exc,
4063 collstart-startp, collend-startp, &newpos);
4064 if (repunicode == NULL)
4065 goto onError;
4066 /* generate replacement */
4067 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004068 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004069 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4070 Py_DECREF(repunicode);
4071 goto onError;
4072 }
4073 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4074 *str++ = *uni2;
4075 p = startp + newpos;
4076 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077 }
4078 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004079 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004080 /* Resize if we allocated to much */
4081 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004082 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004083 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004084 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004085 }
4086 Py_XDECREF(exc);
4087 Py_XDECREF(errorHandler);
4088 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004089
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004090 onError:
4091 Py_XDECREF(res);
4092 Py_XDECREF(exc);
4093 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094 return NULL;
4095}
4096
4097PyObject *PyUnicode_Translate(PyObject *str,
4098 PyObject *mapping,
4099 const char *errors)
4100{
4101 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004102
Guido van Rossumd57fd912000-03-10 22:53:23 +00004103 str = PyUnicode_FromObject(str);
4104 if (str == NULL)
4105 goto onError;
4106 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4107 PyUnicode_GET_SIZE(str),
4108 mapping,
4109 errors);
4110 Py_DECREF(str);
4111 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004112
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113 onError:
4114 Py_XDECREF(str);
4115 return NULL;
4116}
Tim Petersced69f82003-09-16 20:30:58 +00004117
Guido van Rossum9e896b32000-04-05 20:11:21 +00004118/* --- Decimal Encoder ---------------------------------------------------- */
4119
4120int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004121 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004122 char *output,
4123 const char *errors)
4124{
4125 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004126 PyObject *errorHandler = NULL;
4127 PyObject *exc = NULL;
4128 const char *encoding = "decimal";
4129 const char *reason = "invalid decimal Unicode string";
4130 /* the following variable is used for caching string comparisons
4131 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4132 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004133
4134 if (output == NULL) {
4135 PyErr_BadArgument();
4136 return -1;
4137 }
4138
4139 p = s;
4140 end = s + length;
4141 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004142 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004143 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004144 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004145 Py_ssize_t repsize;
4146 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004147 Py_UNICODE *uni2;
4148 Py_UNICODE *collstart;
4149 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004150
Guido van Rossum9e896b32000-04-05 20:11:21 +00004151 if (Py_UNICODE_ISSPACE(ch)) {
4152 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004153 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004154 continue;
4155 }
4156 decimal = Py_UNICODE_TODECIMAL(ch);
4157 if (decimal >= 0) {
4158 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004159 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004160 continue;
4161 }
Guido van Rossumba477042000-04-06 18:18:10 +00004162 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004163 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004164 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004165 continue;
4166 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004167 /* All other characters are considered unencodable */
4168 collstart = p;
4169 collend = p+1;
4170 while (collend < end) {
4171 if ((0 < *collend && *collend < 256) ||
4172 !Py_UNICODE_ISSPACE(*collend) ||
4173 Py_UNICODE_TODECIMAL(*collend))
4174 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004175 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004176 /* cache callback name lookup
4177 * (if not done yet, i.e. it's the first error) */
4178 if (known_errorHandler==-1) {
4179 if ((errors==NULL) || (!strcmp(errors, "strict")))
4180 known_errorHandler = 1;
4181 else if (!strcmp(errors, "replace"))
4182 known_errorHandler = 2;
4183 else if (!strcmp(errors, "ignore"))
4184 known_errorHandler = 3;
4185 else if (!strcmp(errors, "xmlcharrefreplace"))
4186 known_errorHandler = 4;
4187 else
4188 known_errorHandler = 0;
4189 }
4190 switch (known_errorHandler) {
4191 case 1: /* strict */
4192 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4193 goto onError;
4194 case 2: /* replace */
4195 for (p = collstart; p < collend; ++p)
4196 *output++ = '?';
4197 /* fall through */
4198 case 3: /* ignore */
4199 p = collend;
4200 break;
4201 case 4: /* xmlcharrefreplace */
4202 /* generate replacement (temporarily (mis)uses p) */
4203 for (p = collstart; p < collend; ++p)
4204 output += sprintf(output, "&#%d;", (int)*p);
4205 p = collend;
4206 break;
4207 default:
4208 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4209 encoding, reason, s, length, &exc,
4210 collstart-s, collend-s, &newpos);
4211 if (repunicode == NULL)
4212 goto onError;
4213 /* generate replacement */
4214 repsize = PyUnicode_GET_SIZE(repunicode);
4215 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4216 Py_UNICODE ch = *uni2;
4217 if (Py_UNICODE_ISSPACE(ch))
4218 *output++ = ' ';
4219 else {
4220 decimal = Py_UNICODE_TODECIMAL(ch);
4221 if (decimal >= 0)
4222 *output++ = '0' + decimal;
4223 else if (0 < ch && ch < 256)
4224 *output++ = (char)ch;
4225 else {
4226 Py_DECREF(repunicode);
4227 raise_encode_exception(&exc, encoding,
4228 s, length, collstart-s, collend-s, reason);
4229 goto onError;
4230 }
4231 }
4232 }
4233 p = s + newpos;
4234 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004235 }
4236 }
4237 /* 0-terminate the output string */
4238 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004239 Py_XDECREF(exc);
4240 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004241 return 0;
4242
4243 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004244 Py_XDECREF(exc);
4245 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004246 return -1;
4247}
4248
Guido van Rossumd57fd912000-03-10 22:53:23 +00004249/* --- Helpers ------------------------------------------------------------ */
4250
Fredrik Lundha50d2012006-05-26 17:04:58 +00004251#define STRINGLIB_CHAR Py_UNICODE
Fredrik Lundh6471ee42006-05-24 14:28:11 +00004252
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004253#define STRINGLIB_LEN PyUnicode_GET_SIZE
Fredrik Lundhb9479482006-05-26 17:22:38 +00004254#define STRINGLIB_NEW PyUnicode_FromUnicode
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004255#define STRINGLIB_STR PyUnicode_AS_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00004256
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00004257Py_LOCAL_INLINE(int)
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004258STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
4259{
Fredrik Lundh9c0e9c02006-05-26 18:24:15 +00004260 if (str[0] != other[0])
4261 return 1;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004262 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
4263}
4264
Fredrik Lundhb9479482006-05-26 17:22:38 +00004265#define STRINGLIB_EMPTY unicode_empty
4266
Fredrik Lundha50d2012006-05-26 17:04:58 +00004267#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004268
4269#include "stringlib/count.h"
4270#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00004271#include "stringlib/partition.h"
4272
/* helper macro to fixup start/end slice values against (obj)->length:
   a negative value has the length added to it and is then floored at 0;
   end is additionally capped at the length.  It works on the variables
   named "start" and "end" of the expanding scope.
   Note: this expands to several statements, so only use it as a complete
   statement of its own. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
4285
Martin v. Löwis18e16552006-02-15 17:27:45 +00004286Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004287 PyObject *substr,
4288 Py_ssize_t start,
4289 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004290{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004291 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004292 PyUnicodeObject* str_obj;
4293 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004294
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004295 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4296 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004297 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004298 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4299 if (!sub_obj) {
4300 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004301 return -1;
4302 }
Tim Petersced69f82003-09-16 20:30:58 +00004303
Fredrik Lundhc8162812006-05-26 19:33:03 +00004304 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004305
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004306 result = stringlib_count(
4307 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4308 );
4309
4310 Py_DECREF(sub_obj);
4311 Py_DECREF(str_obj);
4312
Guido van Rossumd57fd912000-03-10 22:53:23 +00004313 return result;
4314}
4315
Martin v. Löwis18e16552006-02-15 17:27:45 +00004316Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004317 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004318 Py_ssize_t start,
4319 Py_ssize_t end,
4320 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004321{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004322 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004323
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004324 str = PyUnicode_FromObject(str);
4325 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004326 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004327 sub = PyUnicode_FromObject(sub);
4328 if (!sub) {
4329 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004330 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004331 }
Tim Petersced69f82003-09-16 20:30:58 +00004332
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004333 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004334 result = stringlib_find_slice(
4335 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4336 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4337 start, end
4338 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004339 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004340 result = stringlib_rfind_slice(
4341 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4342 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4343 start, end
4344 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004345
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004346 Py_DECREF(str);
4347 Py_DECREF(sub);
4348
Guido van Rossumd57fd912000-03-10 22:53:23 +00004349 return result;
4350}
4351
Tim Petersced69f82003-09-16 20:30:58 +00004352static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004353int tailmatch(PyUnicodeObject *self,
4354 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004355 Py_ssize_t start,
4356 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004357 int direction)
4358{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004359 if (substring->length == 0)
4360 return 1;
4361
Fredrik Lundhc8162812006-05-26 19:33:03 +00004362 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004363
4364 end -= substring->length;
4365 if (end < start)
4366 return 0;
4367
4368 if (direction > 0) {
4369 if (Py_UNICODE_MATCH(self, end, substring))
4370 return 1;
4371 } else {
4372 if (Py_UNICODE_MATCH(self, start, substring))
4373 return 1;
4374 }
4375
4376 return 0;
4377}
4378
Martin v. Löwis18e16552006-02-15 17:27:45 +00004379Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004380 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004381 Py_ssize_t start,
4382 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004383 int direction)
4384{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004385 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004386
Guido van Rossumd57fd912000-03-10 22:53:23 +00004387 str = PyUnicode_FromObject(str);
4388 if (str == NULL)
4389 return -1;
4390 substr = PyUnicode_FromObject(substr);
4391 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004392 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004393 return -1;
4394 }
Tim Petersced69f82003-09-16 20:30:58 +00004395
Guido van Rossumd57fd912000-03-10 22:53:23 +00004396 result = tailmatch((PyUnicodeObject *)str,
4397 (PyUnicodeObject *)substr,
4398 start, end, direction);
4399 Py_DECREF(str);
4400 Py_DECREF(substr);
4401 return result;
4402}
4403
Guido van Rossumd57fd912000-03-10 22:53:23 +00004404/* Apply fixfct filter to the Unicode object self and return a
4405 reference to the modified object */
4406
Tim Petersced69f82003-09-16 20:30:58 +00004407static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004408PyObject *fixup(PyUnicodeObject *self,
4409 int (*fixfct)(PyUnicodeObject *s))
4410{
4411
4412 PyUnicodeObject *u;
4413
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004414 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004415 if (u == NULL)
4416 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004417
4418 Py_UNICODE_COPY(u->str, self->str, self->length);
4419
Tim Peters7a29bd52001-09-12 03:03:31 +00004420 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004421 /* fixfct should return TRUE if it modified the buffer. If
4422 FALSE, return a reference to the original buffer instead
4423 (to save space, not time) */
4424 Py_INCREF(self);
4425 Py_DECREF(u);
4426 return (PyObject*) self;
4427 }
4428 return (PyObject*) u;
4429}
4430
Tim Petersced69f82003-09-16 20:30:58 +00004431static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004432int fixupper(PyUnicodeObject *self)
4433{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004434 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004435 Py_UNICODE *s = self->str;
4436 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004437
Guido van Rossumd57fd912000-03-10 22:53:23 +00004438 while (len-- > 0) {
4439 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004440
Guido van Rossumd57fd912000-03-10 22:53:23 +00004441 ch = Py_UNICODE_TOUPPER(*s);
4442 if (ch != *s) {
4443 status = 1;
4444 *s = ch;
4445 }
4446 s++;
4447 }
4448
4449 return status;
4450}
4451
Tim Petersced69f82003-09-16 20:30:58 +00004452static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004453int fixlower(PyUnicodeObject *self)
4454{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004455 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004456 Py_UNICODE *s = self->str;
4457 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004458
Guido van Rossumd57fd912000-03-10 22:53:23 +00004459 while (len-- > 0) {
4460 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004461
Guido van Rossumd57fd912000-03-10 22:53:23 +00004462 ch = Py_UNICODE_TOLOWER(*s);
4463 if (ch != *s) {
4464 status = 1;
4465 *s = ch;
4466 }
4467 s++;
4468 }
4469
4470 return status;
4471}
4472
Tim Petersced69f82003-09-16 20:30:58 +00004473static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004474int fixswapcase(PyUnicodeObject *self)
4475{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004476 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004477 Py_UNICODE *s = self->str;
4478 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004479
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480 while (len-- > 0) {
4481 if (Py_UNICODE_ISUPPER(*s)) {
4482 *s = Py_UNICODE_TOLOWER(*s);
4483 status = 1;
4484 } else if (Py_UNICODE_ISLOWER(*s)) {
4485 *s = Py_UNICODE_TOUPPER(*s);
4486 status = 1;
4487 }
4488 s++;
4489 }
4490
4491 return status;
4492}
4493
Tim Petersced69f82003-09-16 20:30:58 +00004494static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004495int fixcapitalize(PyUnicodeObject *self)
4496{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004497 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004498 Py_UNICODE *s = self->str;
4499 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004500
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004501 if (len == 0)
4502 return 0;
4503 if (Py_UNICODE_ISLOWER(*s)) {
4504 *s = Py_UNICODE_TOUPPER(*s);
4505 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004506 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004507 s++;
4508 while (--len > 0) {
4509 if (Py_UNICODE_ISUPPER(*s)) {
4510 *s = Py_UNICODE_TOLOWER(*s);
4511 status = 1;
4512 }
4513 s++;
4514 }
4515 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004516}
4517
4518static
4519int fixtitle(PyUnicodeObject *self)
4520{
4521 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4522 register Py_UNICODE *e;
4523 int previous_is_cased;
4524
4525 /* Shortcut for single character strings */
4526 if (PyUnicode_GET_SIZE(self) == 1) {
4527 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4528 if (*p != ch) {
4529 *p = ch;
4530 return 1;
4531 }
4532 else
4533 return 0;
4534 }
Tim Petersced69f82003-09-16 20:30:58 +00004535
Guido van Rossumd57fd912000-03-10 22:53:23 +00004536 e = p + PyUnicode_GET_SIZE(self);
4537 previous_is_cased = 0;
4538 for (; p < e; p++) {
4539 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004540
Guido van Rossumd57fd912000-03-10 22:53:23 +00004541 if (previous_is_cased)
4542 *p = Py_UNICODE_TOLOWER(ch);
4543 else
4544 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004545
4546 if (Py_UNICODE_ISLOWER(ch) ||
4547 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004548 Py_UNICODE_ISTITLE(ch))
4549 previous_is_cased = 1;
4550 else
4551 previous_is_cased = 0;
4552 }
4553 return 1;
4554}
4555
Tim Peters8ce9f162004-08-27 01:49:32 +00004556PyObject *
4557PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004558{
Tim Peters8ce9f162004-08-27 01:49:32 +00004559 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004560 const Py_UNICODE blank = ' ';
4561 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00004562 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004563 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00004564 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4565 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004566 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4567 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004568 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004569 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004570 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004571
Tim Peters05eba1f2004-08-27 21:32:02 +00004572 fseq = PySequence_Fast(seq, "");
4573 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004574 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004575 }
4576
Tim Peters91879ab2004-08-27 22:35:44 +00004577 /* Grrrr. A codec may be invoked to convert str objects to
4578 * Unicode, and so it's possible to call back into Python code
4579 * during PyUnicode_FromObject(), and so it's possible for a sick
4580 * codec to change the size of fseq (if seq is a list). Therefore
4581 * we have to keep refetching the size -- can't assume seqlen
4582 * is invariant.
4583 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004584 seqlen = PySequence_Fast_GET_SIZE(fseq);
4585 /* If empty sequence, return u"". */
4586 if (seqlen == 0) {
4587 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4588 goto Done;
4589 }
4590 /* If singleton sequence with an exact Unicode, return that. */
4591 if (seqlen == 1) {
4592 item = PySequence_Fast_GET_ITEM(fseq, 0);
4593 if (PyUnicode_CheckExact(item)) {
4594 Py_INCREF(item);
4595 res = (PyUnicodeObject *)item;
4596 goto Done;
4597 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004598 }
4599
Tim Peters05eba1f2004-08-27 21:32:02 +00004600 /* At least two items to join, or one that isn't exact Unicode. */
4601 if (seqlen > 1) {
4602 /* Set up sep and seplen -- they're needed. */
4603 if (separator == NULL) {
4604 sep = &blank;
4605 seplen = 1;
4606 }
4607 else {
4608 internal_separator = PyUnicode_FromObject(separator);
4609 if (internal_separator == NULL)
4610 goto onError;
4611 sep = PyUnicode_AS_UNICODE(internal_separator);
4612 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004613 /* In case PyUnicode_FromObject() mutated seq. */
4614 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004615 }
4616 }
4617
4618 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004619 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004620 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004621 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004622 res_p = PyUnicode_AS_UNICODE(res);
4623 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004624
Tim Peters05eba1f2004-08-27 21:32:02 +00004625 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00004626 Py_ssize_t itemlen;
4627 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004628
4629 item = PySequence_Fast_GET_ITEM(fseq, i);
4630 /* Convert item to Unicode. */
4631 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4632 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00004633 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004634 " %.80s found",
4635 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004636 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004637 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004638 item = PyUnicode_FromObject(item);
4639 if (item == NULL)
4640 goto onError;
4641 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004642
Tim Peters91879ab2004-08-27 22:35:44 +00004643 /* In case PyUnicode_FromObject() mutated seq. */
4644 seqlen = PySequence_Fast_GET_SIZE(fseq);
4645
Tim Peters8ce9f162004-08-27 01:49:32 +00004646 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004647 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004648 new_res_used = res_used + itemlen;
Georg Brandl90e27d32006-06-10 06:40:50 +00004649 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004650 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004651 if (i < seqlen - 1) {
4652 new_res_used += seplen;
Georg Brandl90e27d32006-06-10 06:40:50 +00004653 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004654 goto Overflow;
4655 }
4656 if (new_res_used > res_alloc) {
4657 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004658 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004659 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00004660 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004661 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004662 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00004663 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004664 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004665 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004666 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004667 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004668 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004669
4670 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004671 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004672 res_p += itemlen;
4673 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004674 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004675 res_p += seplen;
4676 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004677 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004678 res_used = new_res_used;
4679 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004680
Tim Peters05eba1f2004-08-27 21:32:02 +00004681 /* Shrink res to match the used area; this probably can't fail,
4682 * but it's cheap to check.
4683 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004684 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004685 goto onError;
4686
4687 Done:
4688 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004689 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004690 return (PyObject *)res;
4691
Tim Peters8ce9f162004-08-27 01:49:32 +00004692 Overflow:
4693 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00004694 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00004695 Py_DECREF(item);
4696 /* fall through */
4697
Guido van Rossumd57fd912000-03-10 22:53:23 +00004698 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004699 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004700 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004701 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004702 return NULL;
4703}
4704
Tim Petersced69f82003-09-16 20:30:58 +00004705static
4706PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004707 Py_ssize_t left,
4708 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004709 Py_UNICODE fill)
4710{
4711 PyUnicodeObject *u;
4712
4713 if (left < 0)
4714 left = 0;
4715 if (right < 0)
4716 right = 0;
4717
Tim Peters7a29bd52001-09-12 03:03:31 +00004718 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004719 Py_INCREF(self);
4720 return self;
4721 }
4722
4723 u = _PyUnicode_New(left + self->length + right);
4724 if (u) {
4725 if (left)
4726 Py_UNICODE_FILL(u->str, fill, left);
4727 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4728 if (right)
4729 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4730 }
4731
4732 return u;
4733}
4734
/* Append a new unicode object holding data[left:right] to `list`.
 *
 * Expands in the caller's scope and relies on three names there: a
 * `PyObject *str` scratch variable, the target `list`, and an `onError`
 * label that is jumped to when either the allocation or the append fails.
 * The temporary reference held in `str` is dropped on both the success
 * and the failure path of the append.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement and
 * cannot capture a following `else` or escape an unbraced `if`.
 */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4745
4746static
4747PyObject *split_whitespace(PyUnicodeObject *self,
4748 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004749 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004751 register Py_ssize_t i;
4752 register Py_ssize_t j;
4753 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004754 PyObject *str;
4755
4756 for (i = j = 0; i < len; ) {
4757 /* find a token */
4758 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4759 i++;
4760 j = i;
4761 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4762 i++;
4763 if (j < i) {
4764 if (maxcount-- <= 0)
4765 break;
4766 SPLIT_APPEND(self->str, j, i);
4767 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4768 i++;
4769 j = i;
4770 }
4771 }
4772 if (j < len) {
4773 SPLIT_APPEND(self->str, j, len);
4774 }
4775 return list;
4776
4777 onError:
4778 Py_DECREF(list);
4779 return NULL;
4780}
4781
4782PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004783 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004784{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004785 register Py_ssize_t i;
4786 register Py_ssize_t j;
4787 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004788 PyObject *list;
4789 PyObject *str;
4790 Py_UNICODE *data;
4791
4792 string = PyUnicode_FromObject(string);
4793 if (string == NULL)
4794 return NULL;
4795 data = PyUnicode_AS_UNICODE(string);
4796 len = PyUnicode_GET_SIZE(string);
4797
Guido van Rossumd57fd912000-03-10 22:53:23 +00004798 list = PyList_New(0);
4799 if (!list)
4800 goto onError;
4801
4802 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004803 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004804
Guido van Rossumd57fd912000-03-10 22:53:23 +00004805 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004806 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004808
4809 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004810 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811 if (i < len) {
4812 if (data[i] == '\r' && i + 1 < len &&
4813 data[i+1] == '\n')
4814 i += 2;
4815 else
4816 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004817 if (keepends)
4818 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004819 }
Guido van Rossum86662912000-04-11 15:38:46 +00004820 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004821 j = i;
4822 }
4823 if (j < len) {
4824 SPLIT_APPEND(data, j, len);
4825 }
4826
4827 Py_DECREF(string);
4828 return list;
4829
4830 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004831 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832 Py_DECREF(string);
4833 return NULL;
4834}
4835
Tim Petersced69f82003-09-16 20:30:58 +00004836static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004837PyObject *split_char(PyUnicodeObject *self,
4838 PyObject *list,
4839 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004840 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004842 register Py_ssize_t i;
4843 register Py_ssize_t j;
4844 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845 PyObject *str;
4846
4847 for (i = j = 0; i < len; ) {
4848 if (self->str[i] == ch) {
4849 if (maxcount-- <= 0)
4850 break;
4851 SPLIT_APPEND(self->str, j, i);
4852 i = j = i + 1;
4853 } else
4854 i++;
4855 }
4856 if (j <= len) {
4857 SPLIT_APPEND(self->str, j, len);
4858 }
4859 return list;
4860
4861 onError:
4862 Py_DECREF(list);
4863 return NULL;
4864}
4865
Tim Petersced69f82003-09-16 20:30:58 +00004866static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004867PyObject *split_substring(PyUnicodeObject *self,
4868 PyObject *list,
4869 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004870 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004872 register Py_ssize_t i;
4873 register Py_ssize_t j;
4874 Py_ssize_t len = self->length;
4875 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876 PyObject *str;
4877
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004878 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879 if (Py_UNICODE_MATCH(self, i, substring)) {
4880 if (maxcount-- <= 0)
4881 break;
4882 SPLIT_APPEND(self->str, j, i);
4883 i = j = i + sublen;
4884 } else
4885 i++;
4886 }
4887 if (j <= len) {
4888 SPLIT_APPEND(self->str, j, len);
4889 }
4890 return list;
4891
4892 onError:
4893 Py_DECREF(list);
4894 return NULL;
4895}
4896
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004897static
4898PyObject *rsplit_whitespace(PyUnicodeObject *self,
4899 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004900 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004901{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004902 register Py_ssize_t i;
4903 register Py_ssize_t j;
4904 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004905 PyObject *str;
4906
4907 for (i = j = len - 1; i >= 0; ) {
4908 /* find a token */
4909 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4910 i--;
4911 j = i;
4912 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4913 i--;
4914 if (j > i) {
4915 if (maxcount-- <= 0)
4916 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004917 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004918 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4919 i--;
4920 j = i;
4921 }
4922 }
4923 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004924 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004925 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004926 if (PyList_Reverse(list) < 0)
4927 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004928 return list;
4929
4930 onError:
4931 Py_DECREF(list);
4932 return NULL;
4933}
4934
4935static
4936PyObject *rsplit_char(PyUnicodeObject *self,
4937 PyObject *list,
4938 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004939 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004940{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004941 register Py_ssize_t i;
4942 register Py_ssize_t j;
4943 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004944 PyObject *str;
4945
4946 for (i = j = len - 1; i >= 0; ) {
4947 if (self->str[i] == ch) {
4948 if (maxcount-- <= 0)
4949 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004950 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004951 j = i = i - 1;
4952 } else
4953 i--;
4954 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004955 if (j >= -1) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004956 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004957 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004958 if (PyList_Reverse(list) < 0)
4959 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004960 return list;
4961
4962 onError:
4963 Py_DECREF(list);
4964 return NULL;
4965}
4966
4967static
4968PyObject *rsplit_substring(PyUnicodeObject *self,
4969 PyObject *list,
4970 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004971 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004972{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004973 register Py_ssize_t i;
4974 register Py_ssize_t j;
4975 Py_ssize_t len = self->length;
4976 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004977 PyObject *str;
4978
4979 for (i = len - sublen, j = len; i >= 0; ) {
4980 if (Py_UNICODE_MATCH(self, i, substring)) {
4981 if (maxcount-- <= 0)
4982 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004983 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004984 j = i;
4985 i -= sublen;
4986 } else
4987 i--;
4988 }
4989 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004990 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004991 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004992 if (PyList_Reverse(list) < 0)
4993 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004994 return list;
4995
4996 onError:
4997 Py_DECREF(list);
4998 return NULL;
4999}
5000
Guido van Rossumd57fd912000-03-10 22:53:23 +00005001#undef SPLIT_APPEND
5002
5003static
5004PyObject *split(PyUnicodeObject *self,
5005 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005006 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005007{
5008 PyObject *list;
5009
5010 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005011 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005012
5013 list = PyList_New(0);
5014 if (!list)
5015 return NULL;
5016
5017 if (substring == NULL)
5018 return split_whitespace(self,list,maxcount);
5019
5020 else if (substring->length == 1)
5021 return split_char(self,list,substring->str[0],maxcount);
5022
5023 else if (substring->length == 0) {
5024 Py_DECREF(list);
5025 PyErr_SetString(PyExc_ValueError, "empty separator");
5026 return NULL;
5027 }
5028 else
5029 return split_substring(self,list,substring,maxcount);
5030}
5031
Tim Petersced69f82003-09-16 20:30:58 +00005032static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005033PyObject *rsplit(PyUnicodeObject *self,
5034 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005035 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005036{
5037 PyObject *list;
5038
5039 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005040 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005041
5042 list = PyList_New(0);
5043 if (!list)
5044 return NULL;
5045
5046 if (substring == NULL)
5047 return rsplit_whitespace(self,list,maxcount);
5048
5049 else if (substring->length == 1)
5050 return rsplit_char(self,list,substring->str[0],maxcount);
5051
5052 else if (substring->length == 0) {
5053 Py_DECREF(list);
5054 PyErr_SetString(PyExc_ValueError, "empty separator");
5055 return NULL;
5056 }
5057 else
5058 return rsplit_substring(self,list,substring,maxcount);
5059}
5060
5061static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005062PyObject *replace(PyUnicodeObject *self,
5063 PyUnicodeObject *str1,
5064 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005065 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005066{
5067 PyUnicodeObject *u;
5068
5069 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005070 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005071
Fredrik Lundh347ee272006-05-24 16:35:18 +00005072 if (str1->length == str2->length) {
5073 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005074 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005075 if (str1->length == 1) {
5076 /* replace characters */
5077 Py_UNICODE u1, u2;
5078 if (!findchar(self->str, self->length, str1->str[0]))
5079 goto nothing;
5080 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5081 if (!u)
5082 return NULL;
5083 Py_UNICODE_COPY(u->str, self->str, self->length);
5084 u1 = str1->str[0];
5085 u2 = str2->str[0];
5086 for (i = 0; i < u->length; i++)
5087 if (u->str[i] == u1) {
5088 if (--maxcount < 0)
5089 break;
5090 u->str[i] = u2;
5091 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005092 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005093 i = fastsearch(
5094 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005095 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005096 if (i < 0)
5097 goto nothing;
5098 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5099 if (!u)
5100 return NULL;
5101 Py_UNICODE_COPY(u->str, self->str, self->length);
5102 while (i <= self->length - str1->length)
5103 if (Py_UNICODE_MATCH(self, i, str1)) {
5104 if (--maxcount < 0)
5105 break;
5106 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5107 i += str1->length;
5108 } else
5109 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005110 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005111 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005112
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005113 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005114 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005115 Py_UNICODE *p;
5116
5117 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005118 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005119 if (n > maxcount)
5120 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005121 if (n == 0)
5122 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005123 /* new_size = self->length + n * (str2->length - str1->length)); */
5124 delta = (str2->length - str1->length);
5125 if (delta == 0) {
5126 new_size = self->length;
5127 } else {
5128 product = n * (str2->length - str1->length);
5129 if ((product / (str2->length - str1->length)) != n) {
5130 PyErr_SetString(PyExc_OverflowError,
5131 "replace string is too long");
5132 return NULL;
5133 }
5134 new_size = self->length + product;
5135 if (new_size < 0) {
5136 PyErr_SetString(PyExc_OverflowError,
5137 "replace string is too long");
5138 return NULL;
5139 }
5140 }
5141 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005142 if (!u)
5143 return NULL;
5144 i = 0;
5145 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005146 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005147 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005148 while (n-- > 0) {
5149 /* look for next match */
5150 j = i;
5151 while (j <= e) {
5152 if (Py_UNICODE_MATCH(self, j, str1))
5153 break;
5154 j++;
5155 }
5156 if (j > i) {
5157 if (j > e)
5158 break;
5159 /* copy unchanged part [i:j] */
5160 Py_UNICODE_COPY(p, self->str+i, j-i);
5161 p += j - i;
5162 }
5163 /* copy substitution string */
5164 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005165 Py_UNICODE_COPY(p, str2->str, str2->length);
5166 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005167 }
5168 i = j + str1->length;
5169 }
5170 if (i < self->length)
5171 /* copy tail [i:] */
5172 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005173 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005174 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005175 while (n > 0) {
5176 Py_UNICODE_COPY(p, str2->str, str2->length);
5177 p += str2->length;
5178 if (--n <= 0)
5179 break;
5180 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005181 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005182 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183 }
5184 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005186
5187nothing:
5188 /* nothing to replace; return original string (when possible) */
5189 if (PyUnicode_CheckExact(self)) {
5190 Py_INCREF(self);
5191 return (PyObject *) self;
5192 }
5193 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194}
5195
5196/* --- Unicode Object Methods --------------------------------------------- */
5197
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005198PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199"S.title() -> unicode\n\
5200\n\
5201Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005202characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203
5204static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005205unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207 return fixup(self, fixtitle);
5208}
5209
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005210PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005211"S.capitalize() -> unicode\n\
5212\n\
5213Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005214have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215
5216static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005217unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219 return fixup(self, fixcapitalize);
5220}
5221
5222#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005223PyDoc_STRVAR(capwords__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005224"S.capwords() -> unicode\n\
5225\n\
5226Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005227normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005228
5229static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005230unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231{
5232 PyObject *list;
5233 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005234 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005235
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236 /* Split into words */
5237 list = split(self, NULL, -1);
5238 if (!list)
5239 return NULL;
5240
5241 /* Capitalize each word */
5242 for (i = 0; i < PyList_GET_SIZE(list); i++) {
5243 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
5244 fixcapitalize);
5245 if (item == NULL)
5246 goto onError;
5247 Py_DECREF(PyList_GET_ITEM(list, i));
5248 PyList_SET_ITEM(list, i, item);
5249 }
5250
5251 /* Join the words to form a new string */
5252 item = PyUnicode_Join(NULL, list);
5253
5254onError:
5255 Py_DECREF(list);
5256 return (PyObject *)item;
5257}
5258#endif
5259
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005260/* Argument converter. Coerces to a single unicode character */
5261
5262static int
5263convert_uc(PyObject *obj, void *addr)
5264{
5265 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5266 PyObject *uniobj;
5267 Py_UNICODE *unistr;
5268
5269 uniobj = PyUnicode_FromObject(obj);
5270 if (uniobj == NULL) {
5271 PyErr_SetString(PyExc_TypeError,
5272 "The fill character cannot be converted to Unicode");
5273 return 0;
5274 }
5275 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5276 PyErr_SetString(PyExc_TypeError,
5277 "The fill character must be exactly one character long");
5278 Py_DECREF(uniobj);
5279 return 0;
5280 }
5281 unistr = PyUnicode_AS_UNICODE(uniobj);
5282 *fillcharloc = unistr[0];
5283 Py_DECREF(uniobj);
5284 return 1;
5285}
5286
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005287PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005288"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005290Return S centered in a Unicode string of length width. Padding is\n\
5291done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292
5293static PyObject *
5294unicode_center(PyUnicodeObject *self, PyObject *args)
5295{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005296 Py_ssize_t marg, left;
5297 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005298 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299
Thomas Woutersde017742006-02-16 19:34:37 +00005300 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301 return NULL;
5302
Tim Peters7a29bd52001-09-12 03:03:31 +00005303 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304 Py_INCREF(self);
5305 return (PyObject*) self;
5306 }
5307
5308 marg = width - self->length;
5309 left = marg / 2 + (marg & width & 1);
5310
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005311 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005312}
5313
Marc-André Lemburge5034372000-08-08 08:04:29 +00005314#if 0
5315
5316/* This code should go into some future Unicode collation support
5317 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005318 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005319
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005320/* speedy UTF-16 code point order comparison */
5321/* gleaned from: */
5322/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5323
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005324static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005325{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005326 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005327 0, 0, 0, 0, 0, 0, 0, 0,
5328 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005329 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005330};
5331
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332static int
5333unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5334{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005335 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005336
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337 Py_UNICODE *s1 = str1->str;
5338 Py_UNICODE *s2 = str2->str;
5339
5340 len1 = str1->length;
5341 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005342
Guido van Rossumd57fd912000-03-10 22:53:23 +00005343 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005344 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005345
5346 c1 = *s1++;
5347 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005348
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005349 if (c1 > (1<<11) * 26)
5350 c1 += utf16Fixup[c1>>11];
5351 if (c2 > (1<<11) * 26)
5352 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005353 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005354
5355 if (c1 != c2)
5356 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005357
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005358 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359 }
5360
5361 return (len1 < len2) ? -1 : (len1 != len2);
5362}
5363
Marc-André Lemburge5034372000-08-08 08:04:29 +00005364#else
5365
5366static int
5367unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5368{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005369 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005370
5371 Py_UNICODE *s1 = str1->str;
5372 Py_UNICODE *s2 = str2->str;
5373
5374 len1 = str1->length;
5375 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005376
Marc-André Lemburge5034372000-08-08 08:04:29 +00005377 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005378 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005379
Fredrik Lundh45714e92001-06-26 16:39:36 +00005380 c1 = *s1++;
5381 c2 = *s2++;
5382
5383 if (c1 != c2)
5384 return (c1 < c2) ? -1 : 1;
5385
Marc-André Lemburge5034372000-08-08 08:04:29 +00005386 len1--; len2--;
5387 }
5388
5389 return (len1 < len2) ? -1 : (len1 != len2);
5390}
5391
5392#endif
5393
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394int PyUnicode_Compare(PyObject *left,
5395 PyObject *right)
5396{
5397 PyUnicodeObject *u = NULL, *v = NULL;
5398 int result;
5399
5400 /* Coerce the two arguments */
5401 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5402 if (u == NULL)
5403 goto onError;
5404 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5405 if (v == NULL)
5406 goto onError;
5407
Thomas Wouters7e474022000-07-16 12:04:32 +00005408 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005409 if (v == u) {
5410 Py_DECREF(u);
5411 Py_DECREF(v);
5412 return 0;
5413 }
5414
5415 result = unicode_compare(u, v);
5416
5417 Py_DECREF(u);
5418 Py_DECREF(v);
5419 return result;
5420
5421onError:
5422 Py_XDECREF(u);
5423 Py_XDECREF(v);
5424 return -1;
5425}
5426
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00005427PyObject *PyUnicode_RichCompare(PyObject *left,
5428 PyObject *right,
5429 int op)
5430{
5431 int result;
5432
5433 result = PyUnicode_Compare(left, right);
5434 if (result == -1 && PyErr_Occurred())
5435 goto onError;
5436
5437 /* Convert the return value to a Boolean */
5438 switch (op) {
5439 case Py_EQ:
5440 result = (result == 0);
5441 break;
5442 case Py_NE:
5443 result = (result != 0);
5444 break;
5445 case Py_LE:
5446 result = (result <= 0);
5447 break;
5448 case Py_GE:
5449 result = (result >= 0);
5450 break;
5451 case Py_LT:
5452 result = (result == -1);
5453 break;
5454 case Py_GT:
5455 result = (result == 1);
5456 break;
5457 }
5458 return PyBool_FromLong(result);
5459
5460 onError:
5461
5462 /* Standard case
5463
5464 Type errors mean that PyUnicode_FromObject() could not convert
5465 one of the arguments (usually the right hand side) to Unicode,
5466 ie. we can't handle the comparison request. However, it is
5467 possible that the other object knows a comparison method, which
5468 is why we return Py_NotImplemented to give the other object a
5469 chance.
5470
5471 */
5472 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5473 PyErr_Clear();
5474 Py_INCREF(Py_NotImplemented);
5475 return Py_NotImplemented;
5476 }
5477 if (op != Py_EQ && op != Py_NE)
5478 return NULL;
5479
5480 /* Equality comparison.
5481
5482 This is a special case: we silence any PyExc_UnicodeDecodeError
5483 and instead turn it into a PyErr_UnicodeWarning.
5484
5485 */
5486 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5487 return NULL;
5488 PyErr_Clear();
5489 if (PyErr_Warn(PyExc_UnicodeWarning,
5490 (op == Py_EQ) ?
5491 "Unicode equal comparison "
5492 "failed to convert both arguments to Unicode - "
5493 "interpreting them as being unequal" :
5494 "Unicode unequal comparison "
5495 "failed to convert both arguments to Unicode - "
5496 "interpreting them as being unequal"
5497 ) < 0)
5498 return NULL;
5499 result = (op == Py_NE);
5500 return PyBool_FromLong(result);
5501}
5502
Guido van Rossum403d68b2000-03-13 15:55:09 +00005503int PyUnicode_Contains(PyObject *container,
5504 PyObject *element)
5505{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005506 PyObject *str, *sub;
5507 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005508
5509 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005510 sub = PyUnicode_FromObject(element);
5511 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005512 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005513 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00005514 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005515 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00005516
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005517 str = PyUnicode_FromObject(container);
5518 if (!str) {
5519 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00005520 return -1;
5521 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005522
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005523 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00005524
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005525 Py_DECREF(str);
5526 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00005527
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005528 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005529}
5530
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531/* Concat to string or Unicode object giving a new Unicode object. */
5532
5533PyObject *PyUnicode_Concat(PyObject *left,
5534 PyObject *right)
5535{
5536 PyUnicodeObject *u = NULL, *v = NULL, *w;
5537
5538 /* Coerce the two arguments */
5539 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5540 if (u == NULL)
5541 goto onError;
5542 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5543 if (v == NULL)
5544 goto onError;
5545
5546 /* Shortcuts */
5547 if (v == unicode_empty) {
5548 Py_DECREF(v);
5549 return (PyObject *)u;
5550 }
5551 if (u == unicode_empty) {
5552 Py_DECREF(u);
5553 return (PyObject *)v;
5554 }
5555
5556 /* Concat the two Unicode strings */
5557 w = _PyUnicode_New(u->length + v->length);
5558 if (w == NULL)
5559 goto onError;
5560 Py_UNICODE_COPY(w->str, u->str, u->length);
5561 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5562
5563 Py_DECREF(u);
5564 Py_DECREF(v);
5565 return (PyObject *)w;
5566
5567onError:
5568 Py_XDECREF(u);
5569 Py_XDECREF(v);
5570 return NULL;
5571}
5572
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005573PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574"S.count(sub[, start[, end]]) -> int\n\
5575\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00005576Return the number of non-overlapping occurrences of substring sub in\n\
5577Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005578interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579
5580static PyObject *
5581unicode_count(PyUnicodeObject *self, PyObject *args)
5582{
5583 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005584 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005585 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586 PyObject *result;
5587
Guido van Rossumb8872e62000-05-09 14:14:27 +00005588 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5589 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590 return NULL;
5591
5592 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005593 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594 if (substring == NULL)
5595 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005596
Fredrik Lundhc8162812006-05-26 19:33:03 +00005597 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005598
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005599 result = PyInt_FromSsize_t(
5600 stringlib_count(self->str + start, end - start,
5601 substring->str, substring->length)
5602 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603
5604 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005605
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606 return result;
5607}
5608
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005609PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005610"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005612Encodes S using the codec registered for encoding. encoding defaults\n\
5613to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005614handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005615a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5616'xmlcharrefreplace' as well as any other name registered with\n\
5617codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618
5619static PyObject *
5620unicode_encode(PyUnicodeObject *self, PyObject *args)
5621{
5622 char *encoding = NULL;
5623 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005624 PyObject *v;
5625
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5627 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005628 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005629 if (v == NULL)
5630 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005631 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5632 PyErr_Format(PyExc_TypeError,
5633 "encoder did not return a string/unicode object "
5634 "(type=%.400s)",
5635 v->ob_type->tp_name);
5636 Py_DECREF(v);
5637 return NULL;
5638 }
5639 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005640
5641 onError:
5642 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005643}
5644
5645PyDoc_STRVAR(decode__doc__,
5646"S.decode([encoding[,errors]]) -> string or unicode\n\
5647\n\
5648Decodes S using the codec registered for encoding. encoding defaults\n\
5649to the default encoding. errors may be given to set a different error\n\
5650handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5651a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5652as well as any other name registerd with codecs.register_error that is\n\
5653able to handle UnicodeDecodeErrors.");
5654
5655static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005656unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005657{
5658 char *encoding = NULL;
5659 char *errors = NULL;
5660 PyObject *v;
5661
5662 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5663 return NULL;
5664 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005665 if (v == NULL)
5666 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005667 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5668 PyErr_Format(PyExc_TypeError,
5669 "decoder did not return a string/unicode object "
5670 "(type=%.400s)",
5671 v->ob_type->tp_name);
5672 Py_DECREF(v);
5673 return NULL;
5674 }
5675 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005676
5677 onError:
5678 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005679}
5680
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005681PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682"S.expandtabs([tabsize]) -> unicode\n\
5683\n\
5684Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005685If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686
5687static PyObject*
5688unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5689{
5690 Py_UNICODE *e;
5691 Py_UNICODE *p;
5692 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005693 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694 PyUnicodeObject *u;
5695 int tabsize = 8;
5696
5697 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5698 return NULL;
5699
Thomas Wouters7e474022000-07-16 12:04:32 +00005700 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701 i = j = 0;
5702 e = self->str + self->length;
5703 for (p = self->str; p < e; p++)
5704 if (*p == '\t') {
5705 if (tabsize > 0)
5706 j += tabsize - (j % tabsize);
5707 }
5708 else {
5709 j++;
5710 if (*p == '\n' || *p == '\r') {
5711 i += j;
5712 j = 0;
5713 }
5714 }
5715
5716 /* Second pass: create output string and fill it */
5717 u = _PyUnicode_New(i + j);
5718 if (!u)
5719 return NULL;
5720
5721 j = 0;
5722 q = u->str;
5723
5724 for (p = self->str; p < e; p++)
5725 if (*p == '\t') {
5726 if (tabsize > 0) {
5727 i = tabsize - (j % tabsize);
5728 j += i;
5729 while (i--)
5730 *q++ = ' ';
5731 }
5732 }
5733 else {
5734 j++;
5735 *q++ = *p;
5736 if (*p == '\n' || *p == '\r')
5737 j = 0;
5738 }
5739
5740 return (PyObject*) u;
5741}
5742
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005743PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744"S.find(sub [,start [,end]]) -> int\n\
5745\n\
5746Return the lowest index in S where substring sub is found,\n\
5747such that sub is contained within s[start,end]. Optional\n\
5748arguments start and end are interpreted as in slice notation.\n\
5749\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005750Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751
5752static PyObject *
5753unicode_find(PyUnicodeObject *self, PyObject *args)
5754{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005755 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005756 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005757 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005758 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759
Guido van Rossumb8872e62000-05-09 14:14:27 +00005760 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5761 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005763 substring = PyUnicode_FromObject(substring);
5764 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765 return NULL;
5766
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005767 result = stringlib_find_slice(
5768 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5769 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5770 start, end
5771 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772
5773 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005774
5775 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005776}
5777
5778static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005779unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005780{
5781 if (index < 0 || index >= self->length) {
5782 PyErr_SetString(PyExc_IndexError, "string index out of range");
5783 return NULL;
5784 }
5785
5786 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5787}
5788
5789static long
5790unicode_hash(PyUnicodeObject *self)
5791{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005792 /* Since Unicode objects compare equal to their ASCII string
5793 counterparts, they should use the individual character values
5794 as basis for their hash value. This is needed to assure that
5795 strings and Unicode objects behave in the same way as
5796 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797
Martin v. Löwis18e16552006-02-15 17:27:45 +00005798 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005799 register Py_UNICODE *p;
5800 register long x;
5801
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802 if (self->hash != -1)
5803 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005804 len = PyUnicode_GET_SIZE(self);
5805 p = PyUnicode_AS_UNICODE(self);
5806 x = *p << 7;
5807 while (--len >= 0)
5808 x = (1000003*x) ^ *p++;
5809 x ^= PyUnicode_GET_SIZE(self);
5810 if (x == -1)
5811 x = -2;
5812 self->hash = x;
5813 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005814}
5815
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005816PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817"S.index(sub [,start [,end]]) -> int\n\
5818\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005819Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005820
5821static PyObject *
5822unicode_index(PyUnicodeObject *self, PyObject *args)
5823{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005824 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005825 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005826 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005827 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828
Guido van Rossumb8872e62000-05-09 14:14:27 +00005829 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5830 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005832 substring = PyUnicode_FromObject(substring);
5833 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005834 return NULL;
5835
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005836 result = stringlib_find_slice(
5837 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5838 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5839 start, end
5840 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841
5842 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005843
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844 if (result < 0) {
5845 PyErr_SetString(PyExc_ValueError, "substring not found");
5846 return NULL;
5847 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005848
Martin v. Löwis18e16552006-02-15 17:27:45 +00005849 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850}
5851
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005852PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005853"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005855Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005856at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857
5858static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005859unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860{
5861 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5862 register const Py_UNICODE *e;
5863 int cased;
5864
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865 /* Shortcut for single character strings */
5866 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005867 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005869 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005870 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005871 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005872
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873 e = p + PyUnicode_GET_SIZE(self);
5874 cased = 0;
5875 for (; p < e; p++) {
5876 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005877
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005879 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880 else if (!cased && Py_UNICODE_ISLOWER(ch))
5881 cased = 1;
5882 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005883 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884}
5885
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005886PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005887"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005889Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005890at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891
5892static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005893unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894{
5895 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5896 register const Py_UNICODE *e;
5897 int cased;
5898
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899 /* Shortcut for single character strings */
5900 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005901 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005903 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005904 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005905 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005906
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907 e = p + PyUnicode_GET_SIZE(self);
5908 cased = 0;
5909 for (; p < e; p++) {
5910 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005911
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005913 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 else if (!cased && Py_UNICODE_ISUPPER(ch))
5915 cased = 1;
5916 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005917 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918}
5919
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005920PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005921"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005923Return True if S is a titlecased string and there is at least one\n\
5924character in S, i.e. upper- and titlecase characters may only\n\
5925follow uncased characters and lowercase characters only cased ones.\n\
5926Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927
5928static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005929unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930{
5931 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5932 register const Py_UNICODE *e;
5933 int cased, previous_is_cased;
5934
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935 /* Shortcut for single character strings */
5936 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005937 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5938 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005940 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005941 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005942 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005943
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 e = p + PyUnicode_GET_SIZE(self);
5945 cased = 0;
5946 previous_is_cased = 0;
5947 for (; p < e; p++) {
5948 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005949
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5951 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005952 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 previous_is_cased = 1;
5954 cased = 1;
5955 }
5956 else if (Py_UNICODE_ISLOWER(ch)) {
5957 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005958 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959 previous_is_cased = 1;
5960 cased = 1;
5961 }
5962 else
5963 previous_is_cased = 0;
5964 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005965 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966}
5967
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005968PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005969"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005971Return True if all characters in S are whitespace\n\
5972and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973
5974static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005975unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976{
5977 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5978 register const Py_UNICODE *e;
5979
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980 /* Shortcut for single character strings */
5981 if (PyUnicode_GET_SIZE(self) == 1 &&
5982 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005983 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005985 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005986 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005987 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005988
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 e = p + PyUnicode_GET_SIZE(self);
5990 for (; p < e; p++) {
5991 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005992 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005994 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995}
5996
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005997PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005998"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005999\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006000Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006001and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006002
6003static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006004unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006005{
6006 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6007 register const Py_UNICODE *e;
6008
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006009 /* Shortcut for single character strings */
6010 if (PyUnicode_GET_SIZE(self) == 1 &&
6011 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006012 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006013
6014 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006015 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006016 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006017
6018 e = p + PyUnicode_GET_SIZE(self);
6019 for (; p < e; p++) {
6020 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006021 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006022 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006023 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006024}
6025
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006026PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006027"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006028\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006029Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006030and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006031
6032static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006033unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006034{
6035 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6036 register const Py_UNICODE *e;
6037
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006038 /* Shortcut for single character strings */
6039 if (PyUnicode_GET_SIZE(self) == 1 &&
6040 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006041 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006042
6043 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006044 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006045 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006046
6047 e = p + PyUnicode_GET_SIZE(self);
6048 for (; p < e; p++) {
6049 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006050 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006051 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006052 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006053}
6054
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006055PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006056"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006058Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006059False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060
6061static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006062unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063{
6064 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6065 register const Py_UNICODE *e;
6066
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067 /* Shortcut for single character strings */
6068 if (PyUnicode_GET_SIZE(self) == 1 &&
6069 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006070 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006072 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006073 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006074 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006075
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076 e = p + PyUnicode_GET_SIZE(self);
6077 for (; p < e; p++) {
6078 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006079 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006081 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082}
6083
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006084PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006085"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006086\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006087Return True if all characters in S are digits\n\
6088and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089
6090static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006091unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092{
6093 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6094 register const Py_UNICODE *e;
6095
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096 /* Shortcut for single character strings */
6097 if (PyUnicode_GET_SIZE(self) == 1 &&
6098 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006099 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006101 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006102 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006103 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006104
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105 e = p + PyUnicode_GET_SIZE(self);
6106 for (; p < e; p++) {
6107 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006108 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006110 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111}
6112
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006113PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006114"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006116Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006117False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118
6119static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006120unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121{
6122 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6123 register const Py_UNICODE *e;
6124
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125 /* Shortcut for single character strings */
6126 if (PyUnicode_GET_SIZE(self) == 1 &&
6127 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006128 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006130 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006131 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006132 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006133
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134 e = p + PyUnicode_GET_SIZE(self);
6135 for (; p < e; p++) {
6136 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006137 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006139 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140}
6141
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006142PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143"S.join(sequence) -> unicode\n\
6144\n\
6145Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006146sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147
6148static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006149unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006151 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152}
6153
Martin v. Löwis18e16552006-02-15 17:27:45 +00006154static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155unicode_length(PyUnicodeObject *self)
6156{
6157 return self->length;
6158}
6159
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006160PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006161"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162\n\
6163Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006164done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165
6166static PyObject *
6167unicode_ljust(PyUnicodeObject *self, PyObject *args)
6168{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006169 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006170 Py_UNICODE fillchar = ' ';
6171
Martin v. Löwis412fb672006-04-13 06:34:32 +00006172 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173 return NULL;
6174
Tim Peters7a29bd52001-09-12 03:03:31 +00006175 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176 Py_INCREF(self);
6177 return (PyObject*) self;
6178 }
6179
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006180 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181}
6182
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006183PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184"S.lower() -> unicode\n\
6185\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006186Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187
6188static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006189unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191 return fixup(self, fixlower);
6192}
6193
/* Strip modes.  Each value is also used as an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the strip modes above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
6202
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006203/* externally visible for str.strip(unicode) */
6204PyObject *
6205_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6206{
6207 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006208 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006209 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006210 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6211 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006212
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006213 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6214
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006215 i = 0;
6216 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006217 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6218 i++;
6219 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006220 }
6221
6222 j = len;
6223 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006224 do {
6225 j--;
6226 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6227 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006228 }
6229
6230 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006231 Py_INCREF(self);
6232 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006233 }
6234 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006235 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006236}
6237
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238
6239static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006240do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006242 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006243 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006244
6245 i = 0;
6246 if (striptype != RIGHTSTRIP) {
6247 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6248 i++;
6249 }
6250 }
6251
6252 j = len;
6253 if (striptype != LEFTSTRIP) {
6254 do {
6255 j--;
6256 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6257 j++;
6258 }
6259
6260 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6261 Py_INCREF(self);
6262 return (PyObject*)self;
6263 }
6264 else
6265 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006266}
6267
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006268
6269static PyObject *
6270do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6271{
6272 PyObject *sep = NULL;
6273
6274 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6275 return NULL;
6276
6277 if (sep != NULL && sep != Py_None) {
6278 if (PyUnicode_Check(sep))
6279 return _PyUnicode_XStrip(self, striptype, sep);
6280 else if (PyString_Check(sep)) {
6281 PyObject *res;
6282 sep = PyUnicode_FromObject(sep);
6283 if (sep==NULL)
6284 return NULL;
6285 res = _PyUnicode_XStrip(self, striptype, sep);
6286 Py_DECREF(sep);
6287 return res;
6288 }
6289 else {
6290 PyErr_Format(PyExc_TypeError,
6291 "%s arg must be None, unicode or str",
6292 STRIPNAME(striptype));
6293 return NULL;
6294 }
6295 }
6296
6297 return do_strip(self, striptype);
6298}
6299
6300
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006301PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006302"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006303\n\
6304Return a copy of the string S with leading and trailing\n\
6305whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006306If chars is given and not None, remove characters in chars instead.\n\
6307If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006308
6309static PyObject *
6310unicode_strip(PyUnicodeObject *self, PyObject *args)
6311{
6312 if (PyTuple_GET_SIZE(args) == 0)
6313 return do_strip(self, BOTHSTRIP); /* Common case */
6314 else
6315 return do_argstrip(self, BOTHSTRIP, args);
6316}
6317
6318
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006319PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006320"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006321\n\
6322Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006323If chars is given and not None, remove characters in chars instead.\n\
6324If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006325
6326static PyObject *
6327unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6328{
6329 if (PyTuple_GET_SIZE(args) == 0)
6330 return do_strip(self, LEFTSTRIP); /* Common case */
6331 else
6332 return do_argstrip(self, LEFTSTRIP, args);
6333}
6334
6335
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006336PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006337"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006338\n\
6339Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006340If chars is given and not None, remove characters in chars instead.\n\
6341If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006342
6343static PyObject *
6344unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6345{
6346 if (PyTuple_GET_SIZE(args) == 0)
6347 return do_strip(self, RIGHTSTRIP); /* Common case */
6348 else
6349 return do_argstrip(self, RIGHTSTRIP, args);
6350}
6351
6352
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006354unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006355{
6356 PyUnicodeObject *u;
6357 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006358 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006359 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006360
6361 if (len < 0)
6362 len = 0;
6363
Tim Peters7a29bd52001-09-12 03:03:31 +00006364 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006365 /* no repeat, return original string */
6366 Py_INCREF(str);
6367 return (PyObject*) str;
6368 }
Tim Peters8f422462000-09-09 06:13:41 +00006369
6370 /* ensure # of chars needed doesn't overflow int and # of bytes
6371 * needed doesn't overflow size_t
6372 */
6373 nchars = len * str->length;
6374 if (len && nchars / len != str->length) {
6375 PyErr_SetString(PyExc_OverflowError,
6376 "repeated string is too long");
6377 return NULL;
6378 }
6379 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6380 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6381 PyErr_SetString(PyExc_OverflowError,
6382 "repeated string is too long");
6383 return NULL;
6384 }
6385 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006386 if (!u)
6387 return NULL;
6388
6389 p = u->str;
6390
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006391 if (str->length == 1 && len > 0) {
6392 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006393 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00006394 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006395 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006396 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006397 done = str->length;
6398 }
6399 while (done < nchars) {
6400 int n = (done <= nchars-done) ? done : nchars-done;
6401 Py_UNICODE_COPY(p+done, p, n);
6402 done += n;
6403 }
6404 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405
6406 return (PyObject*) u;
6407}
6408
6409PyObject *PyUnicode_Replace(PyObject *obj,
6410 PyObject *subobj,
6411 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006412 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413{
6414 PyObject *self;
6415 PyObject *str1;
6416 PyObject *str2;
6417 PyObject *result;
6418
6419 self = PyUnicode_FromObject(obj);
6420 if (self == NULL)
6421 return NULL;
6422 str1 = PyUnicode_FromObject(subobj);
6423 if (str1 == NULL) {
6424 Py_DECREF(self);
6425 return NULL;
6426 }
6427 str2 = PyUnicode_FromObject(replobj);
6428 if (str2 == NULL) {
6429 Py_DECREF(self);
6430 Py_DECREF(str1);
6431 return NULL;
6432 }
Tim Petersced69f82003-09-16 20:30:58 +00006433 result = replace((PyUnicodeObject *)self,
6434 (PyUnicodeObject *)str1,
6435 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436 maxcount);
6437 Py_DECREF(self);
6438 Py_DECREF(str1);
6439 Py_DECREF(str2);
6440 return result;
6441}
6442
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006443PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444"S.replace (old, new[, maxsplit]) -> unicode\n\
6445\n\
6446Return a copy of S with all occurrences of substring\n\
6447old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006448given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449
6450static PyObject*
6451unicode_replace(PyUnicodeObject *self, PyObject *args)
6452{
6453 PyUnicodeObject *str1;
6454 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006455 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456 PyObject *result;
6457
Martin v. Löwis18e16552006-02-15 17:27:45 +00006458 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 return NULL;
6460 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6461 if (str1 == NULL)
6462 return NULL;
6463 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006464 if (str2 == NULL) {
6465 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006467 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468
6469 result = replace(self, str1, str2, maxcount);
6470
6471 Py_DECREF(str1);
6472 Py_DECREF(str2);
6473 return result;
6474}
6475
6476static
6477PyObject *unicode_repr(PyObject *unicode)
6478{
6479 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6480 PyUnicode_GET_SIZE(unicode),
6481 1);
6482}
6483
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006484PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485"S.rfind(sub [,start [,end]]) -> int\n\
6486\n\
6487Return the highest index in S where substring sub is found,\n\
6488such that sub is contained within s[start,end]. Optional\n\
6489arguments start and end are interpreted as in slice notation.\n\
6490\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006491Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492
6493static PyObject *
6494unicode_rfind(PyUnicodeObject *self, PyObject *args)
6495{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006496 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006497 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006498 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006499 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500
Guido van Rossumb8872e62000-05-09 14:14:27 +00006501 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6502 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006504 substring = PyUnicode_FromObject(substring);
6505 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506 return NULL;
6507
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006508 result = stringlib_rfind_slice(
6509 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6510 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6511 start, end
6512 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513
6514 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006515
6516 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517}
6518
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006519PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520"S.rindex(sub [,start [,end]]) -> int\n\
6521\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006522Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523
6524static PyObject *
6525unicode_rindex(PyUnicodeObject *self, PyObject *args)
6526{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006527 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006528 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006529 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006530 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531
Guido van Rossumb8872e62000-05-09 14:14:27 +00006532 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6533 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006535 substring = PyUnicode_FromObject(substring);
6536 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537 return NULL;
6538
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006539 result = stringlib_rfind_slice(
6540 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6541 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6542 start, end
6543 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544
6545 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006546
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547 if (result < 0) {
6548 PyErr_SetString(PyExc_ValueError, "substring not found");
6549 return NULL;
6550 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006551 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552}
6553
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006554PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006555"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006556\n\
6557Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006558done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559
6560static PyObject *
6561unicode_rjust(PyUnicodeObject *self, PyObject *args)
6562{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006563 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006564 Py_UNICODE fillchar = ' ';
6565
Martin v. Löwis412fb672006-04-13 06:34:32 +00006566 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567 return NULL;
6568
Tim Peters7a29bd52001-09-12 03:03:31 +00006569 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570 Py_INCREF(self);
6571 return (PyObject*) self;
6572 }
6573
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006574 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575}
6576
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006578unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579{
6580 /* standard clamping */
6581 if (start < 0)
6582 start = 0;
6583 if (end < 0)
6584 end = 0;
6585 if (end > self->length)
6586 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006587 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588 /* full slice, return original string */
6589 Py_INCREF(self);
6590 return (PyObject*) self;
6591 }
6592 if (start > end)
6593 start = end;
6594 /* copy slice */
6595 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6596 end - start);
6597}
6598
6599PyObject *PyUnicode_Split(PyObject *s,
6600 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006601 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602{
6603 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006604
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605 s = PyUnicode_FromObject(s);
6606 if (s == NULL)
6607 return NULL;
6608 if (sep != NULL) {
6609 sep = PyUnicode_FromObject(sep);
6610 if (sep == NULL) {
6611 Py_DECREF(s);
6612 return NULL;
6613 }
6614 }
6615
6616 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6617
6618 Py_DECREF(s);
6619 Py_XDECREF(sep);
6620 return result;
6621}
6622
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006623PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624"S.split([sep [,maxsplit]]) -> list of strings\n\
6625\n\
6626Return a list of the words in S, using sep as the\n\
6627delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006628splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006629any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630
6631static PyObject*
6632unicode_split(PyUnicodeObject *self, PyObject *args)
6633{
6634 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006635 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636
Martin v. Löwis18e16552006-02-15 17:27:45 +00006637 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638 return NULL;
6639
6640 if (substring == Py_None)
6641 return split(self, NULL, maxcount);
6642 else if (PyUnicode_Check(substring))
6643 return split(self, (PyUnicodeObject *)substring, maxcount);
6644 else
6645 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6646}
6647
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006648PyObject *
6649PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6650{
6651 PyObject* str_obj;
6652 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006653 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00006654
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006655 str_obj = PyUnicode_FromObject(str_in);
6656 if (!str_obj)
6657 return NULL;
6658 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00006659 if (!sep_obj) {
6660 Py_DECREF(str_obj);
6661 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006662 }
6663
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006664 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00006665 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6666 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6667 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006668
Fredrik Lundhb9479482006-05-26 17:22:38 +00006669 Py_DECREF(sep_obj);
6670 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006671
6672 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006673}
6674
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006675
6676PyObject *
6677PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6678{
6679 PyObject* str_obj;
6680 PyObject* sep_obj;
6681 PyObject* out;
6682
6683 str_obj = PyUnicode_FromObject(str_in);
6684 if (!str_obj)
6685 return NULL;
6686 sep_obj = PyUnicode_FromObject(sep_in);
6687 if (!sep_obj) {
6688 Py_DECREF(str_obj);
6689 return NULL;
6690 }
6691
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006692 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006693 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6694 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6695 );
6696
6697 Py_DECREF(sep_obj);
6698 Py_DECREF(str_obj);
6699
6700 return out;
6701}
6702
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006703PyDoc_STRVAR(partition__doc__,
6704"S.partition(sep) -> (head, sep, tail)\n\
6705\n\
6706Searches for the separator sep in S, and returns the part before it,\n\
6707the separator itself, and the part after it. If the separator is not\n\
6708found, returns S and two empty strings.");
6709
6710static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00006711unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006712{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006713 return PyUnicode_Partition((PyObject *)self, separator);
6714}
6715
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006716PyDoc_STRVAR(rpartition__doc__,
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00006717"S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006718\n\
6719Searches for the separator sep in S, starting at the end of S, and returns\n\
6720the part before it, the separator itself, and the part after it. If the\n\
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00006721separator is not found, returns two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006722
6723static PyObject*
6724unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6725{
6726 return PyUnicode_RPartition((PyObject *)self, separator);
6727}
6728
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006729PyObject *PyUnicode_RSplit(PyObject *s,
6730 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006731 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006732{
6733 PyObject *result;
6734
6735 s = PyUnicode_FromObject(s);
6736 if (s == NULL)
6737 return NULL;
6738 if (sep != NULL) {
6739 sep = PyUnicode_FromObject(sep);
6740 if (sep == NULL) {
6741 Py_DECREF(s);
6742 return NULL;
6743 }
6744 }
6745
6746 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6747
6748 Py_DECREF(s);
6749 Py_XDECREF(sep);
6750 return result;
6751}
6752
6753PyDoc_STRVAR(rsplit__doc__,
6754"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6755\n\
6756Return a list of the words in S, using sep as the\n\
6757delimiter string, starting at the end of the string and\n\
6758working to the front. If maxsplit is given, at most maxsplit\n\
6759splits are done. If sep is not specified, any whitespace string\n\
6760is a separator.");
6761
6762static PyObject*
6763unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6764{
6765 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006766 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006767
Martin v. Löwis18e16552006-02-15 17:27:45 +00006768 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006769 return NULL;
6770
6771 if (substring == Py_None)
6772 return rsplit(self, NULL, maxcount);
6773 else if (PyUnicode_Check(substring))
6774 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6775 else
6776 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6777}
6778
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006779PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006780"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006781\n\
6782Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006783Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006784is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785
6786static PyObject*
6787unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6788{
Guido van Rossum86662912000-04-11 15:38:46 +00006789 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790
Guido van Rossum86662912000-04-11 15:38:46 +00006791 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792 return NULL;
6793
Guido van Rossum86662912000-04-11 15:38:46 +00006794 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795}
6796
6797static
6798PyObject *unicode_str(PyUnicodeObject *self)
6799{
Fred Drakee4315f52000-05-09 19:53:39 +00006800 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801}
6802
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006803PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804"S.swapcase() -> unicode\n\
6805\n\
6806Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006807and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808
6809static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006810unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812 return fixup(self, fixswapcase);
6813}
6814
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006815PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006816"S.translate(table) -> unicode\n\
6817\n\
6818Return a copy of the string S, where all characters have been mapped\n\
6819through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006820Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6821Unmapped characters are left untouched. Characters mapped to None\n\
6822are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823
6824static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006825unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826{
Tim Petersced69f82003-09-16 20:30:58 +00006827 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006829 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830 "ignore");
6831}
6832
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006833PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834"S.upper() -> unicode\n\
6835\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006836Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837
6838static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006839unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841 return fixup(self, fixupper);
6842}
6843
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006844PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845"S.zfill(width) -> unicode\n\
6846\n\
6847Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006848of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849
6850static PyObject *
6851unicode_zfill(PyUnicodeObject *self, PyObject *args)
6852{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006853 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854 PyUnicodeObject *u;
6855
Martin v. Löwis18e16552006-02-15 17:27:45 +00006856 Py_ssize_t width;
6857 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858 return NULL;
6859
6860 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006861 if (PyUnicode_CheckExact(self)) {
6862 Py_INCREF(self);
6863 return (PyObject*) self;
6864 }
6865 else
6866 return PyUnicode_FromUnicode(
6867 PyUnicode_AS_UNICODE(self),
6868 PyUnicode_GET_SIZE(self)
6869 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870 }
6871
6872 fill = width - self->length;
6873
6874 u = pad(self, fill, 0, '0');
6875
Walter Dörwald068325e2002-04-15 13:36:47 +00006876 if (u == NULL)
6877 return NULL;
6878
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879 if (u->str[fill] == '+' || u->str[fill] == '-') {
6880 /* move sign to beginning of string */
6881 u->str[0] = u->str[fill];
6882 u->str[fill] = '0';
6883 }
6884
6885 return (PyObject*) u;
6886}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887
#if 0
/* Compiled out.  Would return the value of unicode_freelist_size as an int
   object; self is not used. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
6895
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006896PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006897"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006899Return True if S starts with the specified prefix, False otherwise.\n\
6900With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00006901With optional end, stop comparing S at that position.\n\
6902prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903
6904static PyObject *
6905unicode_startswith(PyUnicodeObject *self,
6906 PyObject *args)
6907{
Georg Brandl24250812006-06-09 18:45:48 +00006908 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006910 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006911 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00006912 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913
Georg Brandl24250812006-06-09 18:45:48 +00006914 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00006915 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00006917 if (PyTuple_Check(subobj)) {
6918 Py_ssize_t i;
6919 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6920 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6921 PyTuple_GET_ITEM(subobj, i));
6922 if (substring == NULL)
6923 return NULL;
6924 result = tailmatch(self, substring, start, end, -1);
6925 Py_DECREF(substring);
6926 if (result) {
6927 Py_RETURN_TRUE;
6928 }
6929 }
6930 /* nothing matched */
6931 Py_RETURN_FALSE;
6932 }
6933 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00006935 return NULL;
6936 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00006938 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939}
6940
6941
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006942PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006943"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006945Return True if S ends with the specified suffix, False otherwise.\n\
6946With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00006947With optional end, stop comparing S at that position.\n\
6948suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949
6950static PyObject *
6951unicode_endswith(PyUnicodeObject *self,
6952 PyObject *args)
6953{
Georg Brandl24250812006-06-09 18:45:48 +00006954 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006955 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006956 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006957 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00006958 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959
Georg Brandl24250812006-06-09 18:45:48 +00006960 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
6961 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00006963 if (PyTuple_Check(subobj)) {
6964 Py_ssize_t i;
6965 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6966 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6967 PyTuple_GET_ITEM(subobj, i));
6968 if (substring == NULL)
6969 return NULL;
6970 result = tailmatch(self, substring, start, end, +1);
6971 Py_DECREF(substring);
6972 if (result) {
6973 Py_RETURN_TRUE;
6974 }
6975 }
6976 Py_RETURN_FALSE;
6977 }
6978 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00006980 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981
Georg Brandl24250812006-06-09 18:45:48 +00006982 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00006984 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985}
6986
6987
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006988
6989static PyObject *
6990unicode_getnewargs(PyUnicodeObject *v)
6991{
6992 return Py_BuildValue("(u#)", v->str, v->length);
6993}
6994
6995
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996static PyMethodDef unicode_methods[] = {
6997
6998 /* Order is according to common usage: often used methods should
6999 appear first, since lookup is done sequentially. */
7000
Georg Brandlecdc0a92006-03-30 12:19:07 +00007001 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007002 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7003 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007004 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007005 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7006 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7007 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7008 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7009 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7010 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7011 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007012 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007013 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7014 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7015 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007016 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007017 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007018/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7019 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7020 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7021 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007022 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007023 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007024 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007025 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007026 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7027 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7028 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7029 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7030 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7031 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7032 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7033 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7034 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7035 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7036 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7037 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7038 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7039 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007040 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007041#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007042 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043#endif
7044
7045#if 0
7046 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007047 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048#endif
7049
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007050 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051 {NULL, NULL}
7052};
7053
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007054static PyObject *
7055unicode_mod(PyObject *v, PyObject *w)
7056{
7057 if (!PyUnicode_Check(v)) {
7058 Py_INCREF(Py_NotImplemented);
7059 return Py_NotImplemented;
7060 }
7061 return PyUnicode_Format(v, w);
7062}
7063
7064static PyNumberMethods unicode_as_number = {
7065 0, /*nb_add*/
7066 0, /*nb_subtract*/
7067 0, /*nb_multiply*/
7068 0, /*nb_divide*/
7069 unicode_mod, /*nb_remainder*/
7070};
7071
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007073 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00007074 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007075 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7076 (ssizeargfunc) unicode_getitem, /* sq_item */
7077 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007078 0, /* sq_ass_item */
7079 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00007080 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081};
7082
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007083static PyObject*
7084unicode_subscript(PyUnicodeObject* self, PyObject* item)
7085{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007086 if (PyIndex_Check(item)) {
7087 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007088 if (i == -1 && PyErr_Occurred())
7089 return NULL;
7090 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007091 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007092 return unicode_getitem(self, i);
7093 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007094 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007095 Py_UNICODE* source_buf;
7096 Py_UNICODE* result_buf;
7097 PyObject* result;
7098
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007099 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007100 &start, &stop, &step, &slicelength) < 0) {
7101 return NULL;
7102 }
7103
7104 if (slicelength <= 0) {
7105 return PyUnicode_FromUnicode(NULL, 0);
7106 } else {
7107 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Anthony Baxtera6286212006-04-11 07:42:36 +00007108 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7109 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007110
7111 if (result_buf == NULL)
7112 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007113
7114 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7115 result_buf[i] = source_buf[cur];
7116 }
Tim Petersced69f82003-09-16 20:30:58 +00007117
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007118 result = PyUnicode_FromUnicode(result_buf, slicelength);
7119 PyMem_FREE(result_buf);
7120 return result;
7121 }
7122 } else {
7123 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7124 return NULL;
7125 }
7126}
7127
7128static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007129 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007130 (binaryfunc)unicode_subscript, /* mp_subscript */
7131 (objobjargproc)0, /* mp_ass_subscript */
7132};
7133
Martin v. Löwis18e16552006-02-15 17:27:45 +00007134static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007135unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007136 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137 const void **ptr)
7138{
7139 if (index != 0) {
7140 PyErr_SetString(PyExc_SystemError,
7141 "accessing non-existent unicode segment");
7142 return -1;
7143 }
7144 *ptr = (void *) self->str;
7145 return PyUnicode_GET_DATA_SIZE(self);
7146}
7147
Martin v. Löwis18e16552006-02-15 17:27:45 +00007148static Py_ssize_t
7149unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150 const void **ptr)
7151{
7152 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007153 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007154 return -1;
7155}
7156
7157static int
7158unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007159 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160{
7161 if (lenp)
7162 *lenp = PyUnicode_GET_DATA_SIZE(self);
7163 return 1;
7164}
7165
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007166static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007168 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007169 const void **ptr)
7170{
7171 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007172
Guido van Rossumd57fd912000-03-10 22:53:23 +00007173 if (index != 0) {
7174 PyErr_SetString(PyExc_SystemError,
7175 "accessing non-existent unicode segment");
7176 return -1;
7177 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007178 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179 if (str == NULL)
7180 return -1;
7181 *ptr = (void *) PyString_AS_STRING(str);
7182 return PyString_GET_SIZE(str);
7183}
7184
7185/* Helpers for PyUnicode_Format() */
7186
7187static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007188getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007189{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007190 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191 if (argidx < arglen) {
7192 (*p_argidx)++;
7193 if (arglen < 0)
7194 return args;
7195 else
7196 return PyTuple_GetItem(args, argidx);
7197 }
7198 PyErr_SetString(PyExc_TypeError,
7199 "not enough arguments for format string");
7200 return NULL;
7201}
7202
/* Bit flags set by PyUnicode_Format() for the '-', '+', ' ', '#' and
   '0' format flag characters respectively. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
7208
Martin v. Löwis18e16552006-02-15 17:27:45 +00007209static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007210strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007212 register Py_ssize_t i;
7213 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214 for (i = len - 1; i >= 0; i--)
7215 buffer[i] = (Py_UNICODE) charbuffer[i];
7216
Guido van Rossumd57fd912000-03-10 22:53:23 +00007217 return len;
7218}
7219
Neal Norwitzfc76d632006-01-10 06:03:13 +00007220static int
7221doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7222{
Tim Peters15231542006-02-16 01:08:01 +00007223 Py_ssize_t result;
7224
Neal Norwitzfc76d632006-01-10 06:03:13 +00007225 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007226 result = strtounicode(buffer, (char *)buffer);
7227 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007228}
7229
7230static int
7231longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7232{
Tim Peters15231542006-02-16 01:08:01 +00007233 Py_ssize_t result;
7234
Neal Norwitzfc76d632006-01-10 06:03:13 +00007235 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007236 result = strtounicode(buffer, (char *)buffer);
7237 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007238}
7239
Guido van Rossum078151d2002-08-11 04:24:12 +00007240/* XXX To save some code duplication, formatfloat/long/int could have been
7241 shared with stringobject.c, converting from 8-bit to Unicode after the
7242 formatting is done. */
7243
Guido van Rossumd57fd912000-03-10 22:53:23 +00007244static int
7245formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007246 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007247 int flags,
7248 int prec,
7249 int type,
7250 PyObject *v)
7251{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007252 /* fmt = '%#.' + `prec` + `type`
7253 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254 char fmt[20];
7255 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007256
Guido van Rossumd57fd912000-03-10 22:53:23 +00007257 x = PyFloat_AsDouble(v);
7258 if (x == -1.0 && PyErr_Occurred())
7259 return -1;
7260 if (prec < 0)
7261 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007262 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7263 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007264 /* Worst case length calc to ensure no buffer overrun:
7265
7266 'g' formats:
7267 fmt = %#.<prec>g
7268 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7269 for any double rep.)
7270 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7271
7272 'f' formats:
7273 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7274 len = 1 + 50 + 1 + prec = 52 + prec
7275
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007276 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007277 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007278
7279 */
7280 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7281 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007282 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007283 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007284 return -1;
7285 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007286 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7287 (flags&F_ALT) ? "#" : "",
7288 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007289 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007290}
7291
Tim Peters38fd5b62000-09-21 05:43:11 +00007292static PyObject*
7293formatlong(PyObject *val, int flags, int prec, int type)
7294{
7295 char *buf;
7296 int i, len;
7297 PyObject *str; /* temporary string object. */
7298 PyUnicodeObject *result;
7299
7300 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7301 if (!str)
7302 return NULL;
7303 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007304 if (!result) {
7305 Py_DECREF(str);
7306 return NULL;
7307 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007308 for (i = 0; i < len; i++)
7309 result->str[i] = buf[i];
7310 result->str[len] = 0;
7311 Py_DECREF(str);
7312 return (PyObject*)result;
7313}
7314
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315static int
7316formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007317 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007318 int flags,
7319 int prec,
7320 int type,
7321 PyObject *v)
7322{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007323 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007324 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7325 * + 1 + 1
7326 * = 24
7327 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007328 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007329 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330 long x;
7331
7332 x = PyInt_AsLong(v);
7333 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007334 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007335 if (x < 0 && type == 'u') {
7336 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007337 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007338 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7339 sign = "-";
7340 else
7341 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007343 prec = 1;
7344
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007345 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7346 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007347 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007348 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007349 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007350 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007351 return -1;
7352 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007353
7354 if ((flags & F_ALT) &&
7355 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007356 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007357 * of issues that cause pain:
7358 * - when 0 is being converted, the C standard leaves off
7359 * the '0x' or '0X', which is inconsistent with other
7360 * %#x/%#X conversions and inconsistent with Python's
7361 * hex() function
7362 * - there are platforms that violate the standard and
7363 * convert 0 with the '0x' or '0X'
7364 * (Metrowerks, Compaq Tru64)
7365 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007366 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007367 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007368 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007369 * We can achieve the desired consistency by inserting our
7370 * own '0x' or '0X' prefix, and substituting %x/%X in place
7371 * of %#x/%#X.
7372 *
7373 * Note that this is the same approach as used in
7374 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007375 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007376 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7377 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007378 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007379 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007380 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7381 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007382 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007383 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007384 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007385 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007386 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007387 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388}
7389
7390static int
7391formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007392 size_t buflen,
7393 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007395 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007396 if (PyUnicode_Check(v)) {
7397 if (PyUnicode_GET_SIZE(v) != 1)
7398 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007399 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007400 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007401
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007402 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007403 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007404 goto onError;
7405 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7406 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007407
7408 else {
7409 /* Integer input truncated to a character */
7410 long x;
7411 x = PyInt_AsLong(v);
7412 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007413 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007414#ifdef Py_UNICODE_WIDE
7415 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007416 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007417 "%c arg not in range(0x110000) "
7418 "(wide Python build)");
7419 return -1;
7420 }
7421#else
7422 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007423 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007424 "%c arg not in range(0x10000) "
7425 "(narrow Python build)");
7426 return -1;
7427 }
7428#endif
7429 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007430 }
7431 buf[1] = '\0';
7432 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007433
7434 onError:
7435 PyErr_SetString(PyExc_TypeError,
7436 "%c requires int or char");
7437 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438}
7439
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007440/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
7441
7442 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
7443 chars are formatted. XXX This is a magic number. Each formatting
7444 routine does bounds checking to ensure no overflow, but a better
7445 solution may be to malloc a buffer of appropriate size for each
7446 format. For now, the current solution is sufficient.
7447*/
/* Parenthesised so the cast expression stays a single operand wherever
   the macro is expanded (e.g. after sizeof or next to another operator). */
#define FORMATBUFLEN ((size_t)120)
7449
Guido van Rossumd57fd912000-03-10 22:53:23 +00007450PyObject *PyUnicode_Format(PyObject *format,
7451 PyObject *args)
7452{
7453 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007454 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007455 int args_owned = 0;
7456 PyUnicodeObject *result = NULL;
7457 PyObject *dict = NULL;
7458 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007459
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460 if (format == NULL || args == NULL) {
7461 PyErr_BadInternalCall();
7462 return NULL;
7463 }
7464 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007465 if (uformat == NULL)
7466 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467 fmt = PyUnicode_AS_UNICODE(uformat);
7468 fmtcnt = PyUnicode_GET_SIZE(uformat);
7469
7470 reslen = rescnt = fmtcnt + 100;
7471 result = _PyUnicode_New(reslen);
7472 if (result == NULL)
7473 goto onError;
7474 res = PyUnicode_AS_UNICODE(result);
7475
7476 if (PyTuple_Check(args)) {
7477 arglen = PyTuple_Size(args);
7478 argidx = 0;
7479 }
7480 else {
7481 arglen = -1;
7482 argidx = -2;
7483 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007484 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7485 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007486 dict = args;
7487
7488 while (--fmtcnt >= 0) {
7489 if (*fmt != '%') {
7490 if (--rescnt < 0) {
7491 rescnt = fmtcnt + 100;
7492 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007493 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007494 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007495 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7496 --rescnt;
7497 }
7498 *res++ = *fmt++;
7499 }
7500 else {
7501 /* Got a format specifier */
7502 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007503 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505 Py_UNICODE c = '\0';
7506 Py_UNICODE fill;
7507 PyObject *v = NULL;
7508 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007509 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007510 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007511 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007512 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007513
7514 fmt++;
7515 if (*fmt == '(') {
7516 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007517 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007518 PyObject *key;
7519 int pcount = 1;
7520
7521 if (dict == NULL) {
7522 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007523 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007524 goto onError;
7525 }
7526 ++fmt;
7527 --fmtcnt;
7528 keystart = fmt;
7529 /* Skip over balanced parentheses */
7530 while (pcount > 0 && --fmtcnt >= 0) {
7531 if (*fmt == ')')
7532 --pcount;
7533 else if (*fmt == '(')
7534 ++pcount;
7535 fmt++;
7536 }
7537 keylen = fmt - keystart - 1;
7538 if (fmtcnt < 0 || pcount > 0) {
7539 PyErr_SetString(PyExc_ValueError,
7540 "incomplete format key");
7541 goto onError;
7542 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007543#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007544 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007545 then looked up since Python uses strings to hold
7546 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007547 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007548 key = PyUnicode_EncodeUTF8(keystart,
7549 keylen,
7550 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007551#else
7552 key = PyUnicode_FromUnicode(keystart, keylen);
7553#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007554 if (key == NULL)
7555 goto onError;
7556 if (args_owned) {
7557 Py_DECREF(args);
7558 args_owned = 0;
7559 }
7560 args = PyObject_GetItem(dict, key);
7561 Py_DECREF(key);
7562 if (args == NULL) {
7563 goto onError;
7564 }
7565 args_owned = 1;
7566 arglen = -1;
7567 argidx = -2;
7568 }
7569 while (--fmtcnt >= 0) {
7570 switch (c = *fmt++) {
7571 case '-': flags |= F_LJUST; continue;
7572 case '+': flags |= F_SIGN; continue;
7573 case ' ': flags |= F_BLANK; continue;
7574 case '#': flags |= F_ALT; continue;
7575 case '0': flags |= F_ZERO; continue;
7576 }
7577 break;
7578 }
7579 if (c == '*') {
7580 v = getnextarg(args, arglen, &argidx);
7581 if (v == NULL)
7582 goto onError;
7583 if (!PyInt_Check(v)) {
7584 PyErr_SetString(PyExc_TypeError,
7585 "* wants int");
7586 goto onError;
7587 }
7588 width = PyInt_AsLong(v);
7589 if (width < 0) {
7590 flags |= F_LJUST;
7591 width = -width;
7592 }
7593 if (--fmtcnt >= 0)
7594 c = *fmt++;
7595 }
7596 else if (c >= '0' && c <= '9') {
7597 width = c - '0';
7598 while (--fmtcnt >= 0) {
7599 c = *fmt++;
7600 if (c < '0' || c > '9')
7601 break;
7602 if ((width*10) / 10 != width) {
7603 PyErr_SetString(PyExc_ValueError,
7604 "width too big");
7605 goto onError;
7606 }
7607 width = width*10 + (c - '0');
7608 }
7609 }
7610 if (c == '.') {
7611 prec = 0;
7612 if (--fmtcnt >= 0)
7613 c = *fmt++;
7614 if (c == '*') {
7615 v = getnextarg(args, arglen, &argidx);
7616 if (v == NULL)
7617 goto onError;
7618 if (!PyInt_Check(v)) {
7619 PyErr_SetString(PyExc_TypeError,
7620 "* wants int");
7621 goto onError;
7622 }
7623 prec = PyInt_AsLong(v);
7624 if (prec < 0)
7625 prec = 0;
7626 if (--fmtcnt >= 0)
7627 c = *fmt++;
7628 }
7629 else if (c >= '0' && c <= '9') {
7630 prec = c - '0';
7631 while (--fmtcnt >= 0) {
7632 c = Py_CHARMASK(*fmt++);
7633 if (c < '0' || c > '9')
7634 break;
7635 if ((prec*10) / 10 != prec) {
7636 PyErr_SetString(PyExc_ValueError,
7637 "prec too big");
7638 goto onError;
7639 }
7640 prec = prec*10 + (c - '0');
7641 }
7642 }
7643 } /* prec */
7644 if (fmtcnt >= 0) {
7645 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007646 if (--fmtcnt >= 0)
7647 c = *fmt++;
7648 }
7649 }
7650 if (fmtcnt < 0) {
7651 PyErr_SetString(PyExc_ValueError,
7652 "incomplete format");
7653 goto onError;
7654 }
7655 if (c != '%') {
7656 v = getnextarg(args, arglen, &argidx);
7657 if (v == NULL)
7658 goto onError;
7659 }
7660 sign = 0;
7661 fill = ' ';
7662 switch (c) {
7663
7664 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007665 pbuf = formatbuf;
7666 /* presume that buffer length is at least 1 */
7667 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007668 len = 1;
7669 break;
7670
7671 case 's':
7672 case 'r':
7673 if (PyUnicode_Check(v) && c == 's') {
7674 temp = v;
7675 Py_INCREF(temp);
7676 }
7677 else {
7678 PyObject *unicode;
7679 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007680 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007681 else
7682 temp = PyObject_Repr(v);
7683 if (temp == NULL)
7684 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007685 if (PyUnicode_Check(temp))
7686 /* nothing to do */;
7687 else if (PyString_Check(temp)) {
7688 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007689 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007691 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007692 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007693 Py_DECREF(temp);
7694 temp = unicode;
7695 if (temp == NULL)
7696 goto onError;
7697 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007698 else {
7699 Py_DECREF(temp);
7700 PyErr_SetString(PyExc_TypeError,
7701 "%s argument has non-string str()");
7702 goto onError;
7703 }
7704 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007705 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007706 len = PyUnicode_GET_SIZE(temp);
7707 if (prec >= 0 && len > prec)
7708 len = prec;
7709 break;
7710
7711 case 'i':
7712 case 'd':
7713 case 'u':
7714 case 'o':
7715 case 'x':
7716 case 'X':
7717 if (c == 'i')
7718 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007719 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007720 temp = formatlong(v, flags, prec, c);
7721 if (!temp)
7722 goto onError;
7723 pbuf = PyUnicode_AS_UNICODE(temp);
7724 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007725 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007726 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007727 else {
7728 pbuf = formatbuf;
7729 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7730 flags, prec, c, v);
7731 if (len < 0)
7732 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007733 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007734 }
7735 if (flags & F_ZERO)
7736 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007737 break;
7738
7739 case 'e':
7740 case 'E':
7741 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007742 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743 case 'g':
7744 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007745 if (c == 'F')
7746 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007747 pbuf = formatbuf;
7748 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7749 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007750 if (len < 0)
7751 goto onError;
7752 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007753 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754 fill = '0';
7755 break;
7756
7757 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007758 pbuf = formatbuf;
7759 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007760 if (len < 0)
7761 goto onError;
7762 break;
7763
7764 default:
7765 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007766 "unsupported format character '%c' (0x%x) "
Armin Rigo7ccbca92006-10-04 12:17:45 +00007767 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00007768 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007769 (int)c,
Armin Rigo7ccbca92006-10-04 12:17:45 +00007770 (Py_ssize_t)(fmt - 1 -
7771 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007772 goto onError;
7773 }
7774 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007775 if (*pbuf == '-' || *pbuf == '+') {
7776 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007777 len--;
7778 }
7779 else if (flags & F_SIGN)
7780 sign = '+';
7781 else if (flags & F_BLANK)
7782 sign = ' ';
7783 else
7784 sign = 0;
7785 }
7786 if (width < len)
7787 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007788 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789 reslen -= rescnt;
7790 rescnt = width + fmtcnt + 100;
7791 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007792 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007793 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007794 PyErr_NoMemory();
7795 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007796 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007797 if (_PyUnicode_Resize(&result, reslen) < 0) {
7798 Py_XDECREF(temp);
7799 goto onError;
7800 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801 res = PyUnicode_AS_UNICODE(result)
7802 + reslen - rescnt;
7803 }
7804 if (sign) {
7805 if (fill != ' ')
7806 *res++ = sign;
7807 rescnt--;
7808 if (width > len)
7809 width--;
7810 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007811 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7812 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007813 assert(pbuf[1] == c);
7814 if (fill != ' ') {
7815 *res++ = *pbuf++;
7816 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007817 }
Tim Petersfff53252001-04-12 18:38:48 +00007818 rescnt -= 2;
7819 width -= 2;
7820 if (width < 0)
7821 width = 0;
7822 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007823 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007824 if (width > len && !(flags & F_LJUST)) {
7825 do {
7826 --rescnt;
7827 *res++ = fill;
7828 } while (--width > len);
7829 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007830 if (fill == ' ') {
7831 if (sign)
7832 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007833 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007834 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007835 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007836 *res++ = *pbuf++;
7837 *res++ = *pbuf++;
7838 }
7839 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007840 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007841 res += len;
7842 rescnt -= len;
7843 while (--width >= len) {
7844 --rescnt;
7845 *res++ = ' ';
7846 }
7847 if (dict && (argidx < arglen) && c != '%') {
7848 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007849 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007850 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007851 goto onError;
7852 }
7853 Py_XDECREF(temp);
7854 } /* '%' */
7855 } /* until end */
7856 if (argidx < arglen && !dict) {
7857 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007858 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007859 goto onError;
7860 }
7861
Thomas Woutersa96affe2006-03-12 00:29:36 +00007862 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7863 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007864 if (args_owned) {
7865 Py_DECREF(args);
7866 }
7867 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007868 return (PyObject *)result;
7869
7870 onError:
7871 Py_XDECREF(result);
7872 Py_DECREF(uformat);
7873 if (args_owned) {
7874 Py_DECREF(args);
7875 }
7876 return NULL;
7877}
7878
7879static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007880 (readbufferproc) unicode_buffer_getreadbuf,
7881 (writebufferproc) unicode_buffer_getwritebuf,
7882 (segcountproc) unicode_buffer_getsegcount,
7883 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884};
7885
Jeremy Hylton938ace62002-07-17 16:30:39 +00007886static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007887unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7888
Tim Peters6d6c1a32001-08-02 04:15:00 +00007889static PyObject *
7890unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7891{
7892 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007893 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007894 char *encoding = NULL;
7895 char *errors = NULL;
7896
Guido van Rossume023fe02001-08-30 03:12:59 +00007897 if (type != &PyUnicode_Type)
7898 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007899 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7900 kwlist, &x, &encoding, &errors))
7901 return NULL;
7902 if (x == NULL)
7903 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007904 if (encoding == NULL && errors == NULL)
7905 return PyObject_Unicode(x);
7906 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007907 return PyUnicode_FromEncodedObject(x, encoding, errors);
7908}
7909
Guido van Rossume023fe02001-08-30 03:12:59 +00007910static PyObject *
7911unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7912{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007913 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007914 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007915
7916 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7917 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7918 if (tmp == NULL)
7919 return NULL;
7920 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007921 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007922 if (pnew == NULL) {
7923 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007924 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007925 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007926 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7927 if (pnew->str == NULL) {
7928 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007929 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007930 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007931 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007932 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007933 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7934 pnew->length = n;
7935 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007936 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007937 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007938}
7939
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007940PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007941"unicode(string [, encoding[, errors]]) -> object\n\
7942\n\
7943Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007944encoding defaults to the current default string encoding.\n\
7945errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007946
Guido van Rossumd57fd912000-03-10 22:53:23 +00007947PyTypeObject PyUnicode_Type = {
7948 PyObject_HEAD_INIT(&PyType_Type)
7949 0, /* ob_size */
7950 "unicode", /* tp_name */
7951 sizeof(PyUnicodeObject), /* tp_size */
7952 0, /* tp_itemsize */
7953 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007954 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007955 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007956 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007957 0, /* tp_setattr */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00007958 0, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00007959 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007960 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007961 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007962 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007963 (hashfunc) unicode_hash, /* tp_hash*/
7964 0, /* tp_call*/
7965 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007966 PyObject_GenericGetAttr, /* tp_getattro */
7967 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007968 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007969 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7970 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007971 unicode_doc, /* tp_doc */
7972 0, /* tp_traverse */
7973 0, /* tp_clear */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00007974 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007975 0, /* tp_weaklistoffset */
7976 0, /* tp_iter */
7977 0, /* tp_iternext */
7978 unicode_methods, /* tp_methods */
7979 0, /* tp_members */
7980 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007981 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007982 0, /* tp_dict */
7983 0, /* tp_descr_get */
7984 0, /* tp_descr_set */
7985 0, /* tp_dictoffset */
7986 0, /* tp_init */
7987 0, /* tp_alloc */
7988 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007989 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007990};
7991
7992/* Initialize the Unicode implementation */
7993
Thomas Wouters78890102000-07-22 19:25:51 +00007994void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007996 int i;
7997
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007998 /* XXX - move this array to unicodectype.c ? */
7999 Py_UNICODE linebreak[] = {
8000 0x000A, /* LINE FEED */
8001 0x000D, /* CARRIAGE RETURN */
8002 0x001C, /* FILE SEPARATOR */
8003 0x001D, /* GROUP SEPARATOR */
8004 0x001E, /* RECORD SEPARATOR */
8005 0x0085, /* NEXT LINE */
8006 0x2028, /* LINE SEPARATOR */
8007 0x2029, /* PARAGRAPH SEPARATOR */
8008 };
8009
Fred Drakee4315f52000-05-09 19:53:39 +00008010 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008011 unicode_freelist = NULL;
8012 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008013 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008014 if (!unicode_empty)
8015 return;
8016
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008017 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008018 for (i = 0; i < 256; i++)
8019 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008020 if (PyType_Ready(&PyUnicode_Type) < 0)
8021 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008022
8023 /* initialize the linebreak bloom filter */
8024 bloom_linebreak = make_bloom_mask(
8025 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8026 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008027
8028 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008029}
8030
8031/* Finalize the Unicode implementation */
8032
8033void
Thomas Wouters78890102000-07-22 19:25:51 +00008034_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008036 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008037 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008038
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008039 Py_XDECREF(unicode_empty);
8040 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008041
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008042 for (i = 0; i < 256; i++) {
8043 if (unicode_latin1[i]) {
8044 Py_DECREF(unicode_latin1[i]);
8045 unicode_latin1[i] = NULL;
8046 }
8047 }
8048
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008049 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050 PyUnicodeObject *v = u;
8051 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008052 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008053 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008054 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008055 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008056 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008057 unicode_freelist = NULL;
8058 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008059}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008060
Anthony Baxterac6bd462006-04-13 02:06:09 +00008061#ifdef __cplusplus
8062}
8063#endif
8064
8065
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008066/*
8067Local variables:
8068c-basic-offset: 4
8069indent-tabs-mode: nil
8070End:
8071*/