blob: 2ae3f61d230054326c4b10e4639fd7c245522484 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of Unicode objects parked on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Keep-alive threshold for objects sitting on the free list.

   An object whose length is below this limit keeps its character
   buffer allocated while it waits on the free list, which saves a
   malloc() round trip for small Unicode objects.

   The worst-case cost is MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   A limit of 0 effectively disables the feature.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection: little endian unless WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000116PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000117{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000118#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000119 return 0x10FFFF;
120#else
121 /* This is actually an illegal character, so it should
122 not be passed to unichr. */
123 return 0xFFFF;
124#endif
125}
126
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000127/* --- Bloom Filters ----------------------------------------------------- */
128
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask and take the least
   significant 5 bits of each Unicode character as the bit index. */
132
133/* the linebreak mask is set up by Unicode_Init below */
134
/* Integer type that holds a bloom bitmask; bits 0..31 are used. */
#define BLOOM_MASK unsigned long

/* Bitmask covering the line break characters. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is unsigned long so that bit 31 can be reached
   without shifting a signed int into its sign bit, and mask is
   parenthesized so that any expression may be passed for it. */
#define BLOOM(mask, ch) (((mask) & (1UL << ((ch) & 0x1F))))

/* Line break test: the cheap bitmask check runs first and the full
   Py_UNICODE_ISLINEBREAK() check only when the bitmask matches. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
143
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000144Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000145{
146 /* calculate simple bloom-style bitmask for a given unicode string */
147
148 long mask;
149 Py_ssize_t i;
150
151 mask = 0;
152 for (i = 0; i < len; i++)
153 mask |= (1 << (ptr[i] & 0x1F));
154
155 return mask;
156}
157
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000158Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000159{
160 Py_ssize_t i;
161
162 for (i = 0; i < setlen; i++)
163 if (set[i] == chr)
164 return 1;
165
Fredrik Lundh77633512006-05-23 19:47:35 +0000166 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000167}
168
/* Membership test with a bloom pre-check: unicode_member() is only
   called when the bitmask says chr might be in set.  The expansion is
   wrapped in parentheses so the macro behaves as a single operand,
   e.g. when it is negated with '!'. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
171
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172/* --- Unicode Object ----------------------------------------------------- */
173
174static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000176 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177{
178 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000179
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000180 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000181 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000182 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184 /* Resizing shared object (unicode_empty or single character
185 objects) in-place is not allowed. Use PyUnicode_Resize()
186 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000187
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000188 if (unicode == unicode_empty ||
189 (unicode->length == 1 &&
190 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000191 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000192 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 return -1;
195 }
196
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000197 /* We allocate one more byte to make sure the string is Ux0000 terminated.
198 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000199 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000200 it contains). */
201
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 oldstr = unicode->str;
203 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
204 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000205 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000206 PyErr_NoMemory();
207 return -1;
208 }
209 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000210 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000212 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000214 if (unicode->defenc) {
215 Py_DECREF(unicode->defenc);
216 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217 }
218 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000219
Guido van Rossumd57fd912000-03-10 22:53:23 +0000220 return 0;
221}
222
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
230
231static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000232PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233{
234 register PyUnicodeObject *unicode;
235
Andrew Dalkee0df7622006-05-27 11:04:36 +0000236 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237 if (length == 0 && unicode_empty != NULL) {
238 Py_INCREF(unicode_empty);
239 return unicode_empty;
240 }
241
242 /* Unicode freelist & memory allocation */
243 if (unicode_freelist) {
244 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000245 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000248 /* Keep-Alive optimization: we only upsize the buffer,
249 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000250 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000251 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000252 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000253 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254 }
255 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000256 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000258 }
259 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000262 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 if (unicode == NULL)
264 return NULL;
265 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
266 }
267
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000268 if (!unicode->str) {
269 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000270 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000271 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000272 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000273 * the caller fails before initializing str -- unicode_resize()
274 * reads str[0], and the Keep-Alive optimization can keep memory
275 * allocated for str alive across a call to unicode_dealloc(unicode).
276 * We don't want unicode_resize to read uninitialized memory in
277 * that case.
278 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000279 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000283 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285
286 onError:
287 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000288 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000289 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290}
291
292static
Guido van Rossum9475a232001-10-05 20:51:39 +0000293void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000295 if (PyUnicode_CheckExact(unicode) &&
296 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000297 /* Keep-Alive optimization */
298 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000299 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 unicode->str = NULL;
301 unicode->length = 0;
302 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000303 if (unicode->defenc) {
304 Py_DECREF(unicode->defenc);
305 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000306 }
307 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 *(PyUnicodeObject **)unicode = unicode_freelist;
309 unicode_freelist = unicode;
310 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311 }
312 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000313 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000314 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000315 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316 }
317}
318
Martin v. Löwis18e16552006-02-15 17:27:45 +0000319int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000320{
321 register PyUnicodeObject *v;
322
323 /* Argument checks */
324 if (unicode == NULL) {
325 PyErr_BadInternalCall();
326 return -1;
327 }
328 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000329 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000330 PyErr_BadInternalCall();
331 return -1;
332 }
333
334 /* Resizing unicode_empty and single character objects is not
335 possible since these are being shared. We simply return a fresh
336 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000337 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000338 (v == unicode_empty || v->length == 1)) {
339 PyUnicodeObject *w = _PyUnicode_New(length);
340 if (w == NULL)
341 return -1;
342 Py_UNICODE_COPY(w->str, v->str,
343 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000344 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000345 *unicode = (PyObject *)w;
346 return 0;
347 }
348
349 /* Note that we don't have to modify *unicode for unshared Unicode
350 objects, since we can modify them in-place. */
351 return unicode_resize(v, length);
352}
353
/* File-private shorthand for PyUnicode_Resize() that casts the address
   of a PyUnicodeObject pointer variable to PyObject **.  Internal API
   for use in unicodeobject.c only ! */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize(((PyObject **)(unicodevar)), length)
357
Guido van Rossumd57fd912000-03-10 22:53:23 +0000358PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000359 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360{
361 PyUnicodeObject *unicode;
362
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000363 /* If the Unicode data is known at construction time, we can apply
364 some optimizations which share commonly used objects. */
365 if (u != NULL) {
366
367 /* Optimization for empty strings */
368 if (size == 0 && unicode_empty != NULL) {
369 Py_INCREF(unicode_empty);
370 return (PyObject *)unicode_empty;
371 }
372
373 /* Single character Unicode objects in the Latin-1 range are
374 shared when using this constructor */
375 if (size == 1 && *u < 256) {
376 unicode = unicode_latin1[*u];
377 if (!unicode) {
378 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000379 if (!unicode)
380 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000381 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000382 unicode_latin1[*u] = unicode;
383 }
384 Py_INCREF(unicode);
385 return (PyObject *)unicode;
386 }
387 }
Tim Petersced69f82003-09-16 20:30:58 +0000388
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 unicode = _PyUnicode_New(size);
390 if (!unicode)
391 return NULL;
392
393 /* Copy the Unicode data into the new object */
394 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000395 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000396
397 return (PyObject *)unicode;
398}
399
400#ifdef HAVE_WCHAR_H
401
402PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000403 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000404{
405 PyUnicodeObject *unicode;
406
407 if (w == NULL) {
408 PyErr_BadInternalCall();
409 return NULL;
410 }
411
412 unicode = _PyUnicode_New(size);
413 if (!unicode)
414 return NULL;
415
416 /* Copy the wchar_t data into the new object */
417#ifdef HAVE_USABLE_WCHAR_T
418 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000419#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000420 {
421 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000422 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000424 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425 *u++ = *w++;
426 }
427#endif
428
429 return (PyObject *)unicode;
430}
431
Martin v. Löwis18e16552006-02-15 17:27:45 +0000432Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
433 wchar_t *w,
434 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000435{
436 if (unicode == NULL) {
437 PyErr_BadInternalCall();
438 return -1;
439 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000440
441 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000442 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000443 size = PyUnicode_GET_SIZE(unicode) + 1;
444
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445#ifdef HAVE_USABLE_WCHAR_T
446 memcpy(w, unicode->str, size * sizeof(wchar_t));
447#else
448 {
449 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000450 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000451 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000452 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000453 *w++ = *u++;
454 }
455#endif
456
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000457 if (size > PyUnicode_GET_SIZE(unicode))
458 return PyUnicode_GET_SIZE(unicode);
459 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000460 return size;
461}
462
463#endif
464
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000465PyObject *PyUnicode_FromOrdinal(int ordinal)
466{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000467 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000468
469#ifdef Py_UNICODE_WIDE
470 if (ordinal < 0 || ordinal > 0x10ffff) {
471 PyErr_SetString(PyExc_ValueError,
472 "unichr() arg not in range(0x110000) "
473 "(wide Python build)");
474 return NULL;
475 }
476#else
477 if (ordinal < 0 || ordinal > 0xffff) {
478 PyErr_SetString(PyExc_ValueError,
479 "unichr() arg not in range(0x10000) "
480 "(narrow Python build)");
481 return NULL;
482 }
483#endif
484
Hye-Shik Chang40574832004-04-06 07:24:51 +0000485 s[0] = (Py_UNICODE)ordinal;
486 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000487}
488
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489PyObject *PyUnicode_FromObject(register PyObject *obj)
490{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491 /* XXX Perhaps we should make this API an alias of
492 PyObject_Unicode() instead ?! */
493 if (PyUnicode_CheckExact(obj)) {
494 Py_INCREF(obj);
495 return obj;
496 }
497 if (PyUnicode_Check(obj)) {
498 /* For a Unicode subtype that's not a Unicode object,
499 return a true Unicode object with the same data. */
500 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
501 PyUnicode_GET_SIZE(obj));
502 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000503 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
504}
505
506PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
507 const char *encoding,
508 const char *errors)
509{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000510 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000511 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000512 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000513
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 if (obj == NULL) {
515 PyErr_BadInternalCall();
516 return NULL;
517 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000518
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000519#if 0
520 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000521 that no encodings is given and then redirect to
522 PyObject_Unicode() which then applies the additional logic for
523 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000524
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000525 NOTE: This API should really only be used for object which
526 represent *encoded* Unicode !
527
528 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000529 if (PyUnicode_Check(obj)) {
530 if (encoding) {
531 PyErr_SetString(PyExc_TypeError,
532 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000533 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000534 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000535 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000536 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000537#else
538 if (PyUnicode_Check(obj)) {
539 PyErr_SetString(PyExc_TypeError,
540 "decoding Unicode is not supported");
541 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000542 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000543#endif
544
545 /* Coerce object */
546 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000547 s = PyString_AS_STRING(obj);
548 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000549 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000550 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
551 /* Overwrite the error message with something more useful in
552 case of a TypeError. */
553 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000554 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000555 "coercing to Unicode: need string or buffer, "
556 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000557 obj->ob_type->tp_name);
558 goto onError;
559 }
Tim Petersced69f82003-09-16 20:30:58 +0000560
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000561 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000562 if (len == 0) {
563 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000564 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000565 }
Tim Petersced69f82003-09-16 20:30:58 +0000566 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000567 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000568
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000569 return v;
570
571 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000572 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573}
574
575PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000576 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000577 const char *encoding,
578 const char *errors)
579{
580 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000581
582 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000583 encoding = PyUnicode_GetDefaultEncoding();
584
585 /* Shortcuts for common default encodings */
586 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000587 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000588 else if (strcmp(encoding, "latin-1") == 0)
589 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000590#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
591 else if (strcmp(encoding, "mbcs") == 0)
592 return PyUnicode_DecodeMBCS(s, size, errors);
593#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000594 else if (strcmp(encoding, "ascii") == 0)
595 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596
597 /* Decode via the codec registry */
598 buffer = PyBuffer_FromMemory((void *)s, size);
599 if (buffer == NULL)
600 goto onError;
601 unicode = PyCodec_Decode(buffer, encoding, errors);
602 if (unicode == NULL)
603 goto onError;
604 if (!PyUnicode_Check(unicode)) {
605 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000606 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000607 unicode->ob_type->tp_name);
608 Py_DECREF(unicode);
609 goto onError;
610 }
611 Py_DECREF(buffer);
612 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000613
Guido van Rossumd57fd912000-03-10 22:53:23 +0000614 onError:
615 Py_XDECREF(buffer);
616 return NULL;
617}
618
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000619PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
620 const char *encoding,
621 const char *errors)
622{
623 PyObject *v;
624
625 if (!PyUnicode_Check(unicode)) {
626 PyErr_BadArgument();
627 goto onError;
628 }
629
630 if (encoding == NULL)
631 encoding = PyUnicode_GetDefaultEncoding();
632
633 /* Decode via the codec registry */
634 v = PyCodec_Decode(unicode, encoding, errors);
635 if (v == NULL)
636 goto onError;
637 return v;
638
639 onError:
640 return NULL;
641}
642
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000644 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 const char *encoding,
646 const char *errors)
647{
648 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000649
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 unicode = PyUnicode_FromUnicode(s, size);
651 if (unicode == NULL)
652 return NULL;
653 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
654 Py_DECREF(unicode);
655 return v;
656}
657
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000658PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
659 const char *encoding,
660 const char *errors)
661{
662 PyObject *v;
663
664 if (!PyUnicode_Check(unicode)) {
665 PyErr_BadArgument();
666 goto onError;
667 }
668
669 if (encoding == NULL)
670 encoding = PyUnicode_GetDefaultEncoding();
671
672 /* Encode via the codec registry */
673 v = PyCodec_Encode(unicode, encoding, errors);
674 if (v == NULL)
675 goto onError;
676 return v;
677
678 onError:
679 return NULL;
680}
681
Guido van Rossumd57fd912000-03-10 22:53:23 +0000682PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
683 const char *encoding,
684 const char *errors)
685{
686 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000687
Guido van Rossumd57fd912000-03-10 22:53:23 +0000688 if (!PyUnicode_Check(unicode)) {
689 PyErr_BadArgument();
690 goto onError;
691 }
Fred Drakee4315f52000-05-09 19:53:39 +0000692
Tim Petersced69f82003-09-16 20:30:58 +0000693 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000694 encoding = PyUnicode_GetDefaultEncoding();
695
696 /* Shortcuts for common default encodings */
697 if (errors == NULL) {
698 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000699 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000700 else if (strcmp(encoding, "latin-1") == 0)
701 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000702#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
703 else if (strcmp(encoding, "mbcs") == 0)
704 return PyUnicode_AsMBCSString(unicode);
705#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000706 else if (strcmp(encoding, "ascii") == 0)
707 return PyUnicode_AsASCIIString(unicode);
708 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000709
710 /* Encode via the codec registry */
711 v = PyCodec_Encode(unicode, encoding, errors);
712 if (v == NULL)
713 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000714 if (!PyString_Check(v)) {
715 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000716 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717 v->ob_type->tp_name);
718 Py_DECREF(v);
719 goto onError;
720 }
721 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000722
Guido van Rossumd57fd912000-03-10 22:53:23 +0000723 onError:
724 return NULL;
725}
726
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000727PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
728 const char *errors)
729{
730 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
731
732 if (v)
733 return v;
734 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
735 if (v && errors == NULL)
736 ((PyUnicodeObject *)unicode)->defenc = v;
737 return v;
738}
739
Guido van Rossumd57fd912000-03-10 22:53:23 +0000740Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
741{
742 if (!PyUnicode_Check(unicode)) {
743 PyErr_BadArgument();
744 goto onError;
745 }
746 return PyUnicode_AS_UNICODE(unicode);
747
748 onError:
749 return NULL;
750}
751
Martin v. Löwis18e16552006-02-15 17:27:45 +0000752Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000753{
754 if (!PyUnicode_Check(unicode)) {
755 PyErr_BadArgument();
756 goto onError;
757 }
758 return PyUnicode_GET_SIZE(unicode);
759
760 onError:
761 return -1;
762}
763
Thomas Wouters78890102000-07-22 19:25:51 +0000764const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000765{
766 return unicode_default_encoding;
767}
768
769int PyUnicode_SetDefaultEncoding(const char *encoding)
770{
771 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000772
Fred Drakee4315f52000-05-09 19:53:39 +0000773 /* Make sure the encoding is valid. As side effect, this also
774 loads the encoding into the codec registry cache. */
775 v = _PyCodec_Lookup(encoding);
776 if (v == NULL)
777 goto onError;
778 Py_DECREF(v);
779 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000780 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000781 sizeof(unicode_default_encoding));
782 return 0;
783
784 onError:
785 return -1;
786}
787
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000788/* error handling callback helper:
789 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000790 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000791 and adjust various state variables.
792 return 0 on success, -1 on error
793*/
794
795static
796int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
797 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000798 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
799 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000800{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000801 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000802
803 PyObject *restuple = NULL;
804 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000805 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
806 Py_ssize_t requiredsize;
807 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000808 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000809 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000810 int res = -1;
811
812 if (*errorHandler == NULL) {
813 *errorHandler = PyCodec_LookupError(errors);
814 if (*errorHandler == NULL)
815 goto onError;
816 }
817
818 if (*exceptionObject == NULL) {
819 *exceptionObject = PyUnicodeDecodeError_Create(
820 encoding, input, insize, *startinpos, *endinpos, reason);
821 if (*exceptionObject == NULL)
822 goto onError;
823 }
824 else {
825 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
826 goto onError;
827 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
828 goto onError;
829 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
830 goto onError;
831 }
832
833 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
834 if (restuple == NULL)
835 goto onError;
836 if (!PyTuple_Check(restuple)) {
837 PyErr_Format(PyExc_TypeError, &argparse[4]);
838 goto onError;
839 }
840 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
841 goto onError;
842 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000843 newpos = insize+newpos;
844 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000845 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000846 goto onError;
847 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000848
849 /* need more space? (at least enough for what we
850 have+the replacement+the rest of the string (starting
851 at the new input position), so we won't have to check space
852 when there are no errors in the rest of the string) */
853 repptr = PyUnicode_AS_UNICODE(repunicode);
854 repsize = PyUnicode_GET_SIZE(repunicode);
855 requiredsize = *outpos + repsize + insize-newpos;
856 if (requiredsize > outsize) {
857 if (requiredsize<2*outsize)
858 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000859 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000860 goto onError;
861 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
862 }
863 *endinpos = newpos;
864 *inptr = input + newpos;
865 Py_UNICODE_COPY(*outptr, repptr, repsize);
866 *outptr += repsize;
867 *outpos += repsize;
868 /* we made it! */
869 res = 0;
870
871 onError:
872 Py_XDECREF(restuple);
873 return res;
874}
875
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000876/* --- UTF-7 Codec -------------------------------------------------------- */
877
878/* see RFC2152 for details */
879
/* Classification of the 128 ASCII code points for the UTF-7 codec:
   whether a character is special, i.e. cannot be directly encoded.

     0 - not special
     1 - special
     2 - whitespace (optional)
     3 - RFC2152 Set O (optional)

   Each row below covers 16 consecutive code points. */
static char utf7_special[128] = {
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
898
/* SPECIAL(c, encodeO, encodeWS): true if character c must be base64-encoded.
   That is the case when c is outside 1..127, when utf7_special[c] is 1, or
   when it is 2 (whitespace) / 3 (Set O) and the matching flag is set.

   Note: The comparison (c) <= 0 is a trick to work around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true.

   The table is only indexed after the range tests have failed, so c is
   within 1..127 at that point.  c is evaluated several times. */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): the base64 alphabet character for the low 6 bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true if c is a character of the base64 alphabet.
   Written with explicit ASCII ranges rather than isalnum(): c is a
   Py_UNICODE here and may lie outside the unsigned char range, for which
   isalnum() is undefined, and isalnum() is locale-dependent for 128..255. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64(c): inverse of B64 -- the 6-bit value of base64 character c.
   Only meaningful when B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least 6 bits are pending in ch, write the
   top 6 pending bits as one base64 character to *out.  Modifies out and
   bits; fewer than 6 bits remain pending afterwards. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending in
   ch, extract the top 16 pending bits as one code unit and write it to *out.
   Modifies out, bits and surrogate.  Relies on the expanding function to
   provide the variable `errmsg` and the label `utf7Error`.

   NOTE(review): the range tested below, 0xDC00..0xDFFF, is the low
   surrogate range although the comments speak of the high surrogate --
   confirm the intended behaviour before changing it. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000941
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000942PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000943 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000944 const char *errors)
945{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000946 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000947 Py_ssize_t startinpos;
948 Py_ssize_t endinpos;
949 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000950 const char *e;
951 PyUnicodeObject *unicode;
952 Py_UNICODE *p;
953 const char *errmsg = "";
954 int inShift = 0;
955 unsigned int bitsleft = 0;
956 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000957 int surrogate = 0;
958 PyObject *errorHandler = NULL;
959 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000960
961 unicode = _PyUnicode_New(size);
962 if (!unicode)
963 return NULL;
964 if (size == 0)
965 return (PyObject *)unicode;
966
967 p = unicode->str;
968 e = s + size;
969
970 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000971 Py_UNICODE ch;
972 restart:
973 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000974
975 if (inShift) {
976 if ((ch == '-') || !B64CHAR(ch)) {
977 inShift = 0;
978 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000979
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000980 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
981 if (bitsleft >= 6) {
982 /* The shift sequence has a partial character in it. If
983 bitsleft < 6 then we could just classify it as padding
984 but that is not the case here */
985
986 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000987 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000988 }
989 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000990 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000991 here so indicate the potential of a misencoded character. */
992
993 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
994 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
995 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000996 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000997 }
998
999 if (ch == '-') {
1000 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001001 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001002 inShift = 1;
1003 }
1004 } else if (SPECIAL(ch,0,0)) {
1005 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001006 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001007 } else {
1008 *p++ = ch;
1009 }
1010 } else {
1011 charsleft = (charsleft << 6) | UB64(ch);
1012 bitsleft += 6;
1013 s++;
1014 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1015 }
1016 }
1017 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001018 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001019 s++;
1020 if (s < e && *s == '-') {
1021 s++;
1022 *p++ = '+';
1023 } else
1024 {
1025 inShift = 1;
1026 bitsleft = 0;
1027 }
1028 }
1029 else if (SPECIAL(ch,0,0)) {
1030 errmsg = "unexpected special character";
1031 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001032 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001033 }
1034 else {
1035 *p++ = ch;
1036 s++;
1037 }
1038 continue;
1039 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001040 outpos = p-PyUnicode_AS_UNICODE(unicode);
1041 endinpos = s-starts;
1042 if (unicode_decode_call_errorhandler(
1043 errors, &errorHandler,
1044 "utf7", errmsg,
1045 starts, size, &startinpos, &endinpos, &exc, &s,
1046 (PyObject **)&unicode, &outpos, &p))
1047 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001048 }
1049
1050 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001051 outpos = p-PyUnicode_AS_UNICODE(unicode);
1052 endinpos = size;
1053 if (unicode_decode_call_errorhandler(
1054 errors, &errorHandler,
1055 "utf7", "unterminated shift sequence",
1056 starts, size, &startinpos, &endinpos, &exc, &s,
1057 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001058 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001059 if (s < e)
1060 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001061 }
1062
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001063 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001064 goto onError;
1065
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001066 Py_XDECREF(errorHandler);
1067 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001068 return (PyObject *)unicode;
1069
1070onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001071 Py_XDECREF(errorHandler);
1072 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001073 Py_DECREF(unicode);
1074 return NULL;
1075}
1076
1077
1078PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001079 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001080 int encodeSetO,
1081 int encodeWhiteSpace,
1082 const char *errors)
1083{
1084 PyObject *v;
1085 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001086 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001087 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001088 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001089 unsigned int bitsleft = 0;
1090 unsigned long charsleft = 0;
1091 char * out;
1092 char * start;
1093
1094 if (size == 0)
1095 return PyString_FromStringAndSize(NULL, 0);
1096
1097 v = PyString_FromStringAndSize(NULL, cbAllocated);
1098 if (v == NULL)
1099 return NULL;
1100
1101 start = out = PyString_AS_STRING(v);
1102 for (;i < size; ++i) {
1103 Py_UNICODE ch = s[i];
1104
1105 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001106 if (ch == '+') {
1107 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001108 *out++ = '-';
1109 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1110 charsleft = ch;
1111 bitsleft = 16;
1112 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001113 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001114 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001115 } else {
1116 *out++ = (char) ch;
1117 }
1118 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001119 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1120 *out++ = B64(charsleft << (6-bitsleft));
1121 charsleft = 0;
1122 bitsleft = 0;
1123 /* Characters not in the BASE64 set implicitly unshift the sequence
1124 so no '-' is required, except if the character is itself a '-' */
1125 if (B64CHAR(ch) || ch == '-') {
1126 *out++ = '-';
1127 }
1128 inShift = 0;
1129 *out++ = (char) ch;
1130 } else {
1131 bitsleft += 16;
1132 charsleft = (charsleft << 16) | ch;
1133 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1134
1135 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001136 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001137 or '-' then the shift sequence will be terminated implicitly and we
1138 don't have to insert a '-'. */
1139
1140 if (bitsleft == 0) {
1141 if (i + 1 < size) {
1142 Py_UNICODE ch2 = s[i+1];
1143
1144 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001145
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001146 } else if (B64CHAR(ch2) || ch2 == '-') {
1147 *out++ = '-';
1148 inShift = 0;
1149 } else {
1150 inShift = 0;
1151 }
1152
1153 }
1154 else {
1155 *out++ = '-';
1156 inShift = 0;
1157 }
1158 }
Tim Petersced69f82003-09-16 20:30:58 +00001159 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001160 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001161 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001162 if (bitsleft) {
1163 *out++= B64(charsleft << (6-bitsleft) );
1164 *out++ = '-';
1165 }
1166
Tim Peters5de98422002-04-27 18:44:32 +00001167 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001168 return v;
1169}
1170
1171#undef SPECIAL
1172#undef B64
1173#undef B64CHAR
1174#undef UB64
1175#undef ENCODE
1176#undef DECODE
1177
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178/* --- UTF-8 Codec -------------------------------------------------------- */
1179
/* Map UTF-8 encoded prefix byte to sequence length.  zero means
   illegal prefix.  see RFC 2279 for details.

   Each row below covers 16 consecutive byte values. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7f: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xbf: not a valid first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff: four, five, six bytes; 0xfe and 0xff are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1201
Guido van Rossumd57fd912000-03-10 22:53:23 +00001202PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001203 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001204 const char *errors)
1205{
Walter Dörwald69652032004-09-07 20:24:22 +00001206 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1207}
1208
1209PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001210 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001211 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001212 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001213{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001214 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001215 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001216 Py_ssize_t startinpos;
1217 Py_ssize_t endinpos;
1218 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219 const char *e;
1220 PyUnicodeObject *unicode;
1221 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001222 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001223 PyObject *errorHandler = NULL;
1224 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225
1226 /* Note: size will always be longer than the resulting Unicode
1227 character count */
1228 unicode = _PyUnicode_New(size);
1229 if (!unicode)
1230 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001231 if (size == 0) {
1232 if (consumed)
1233 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001234 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236
1237 /* Unpack UTF-8 encoded data */
1238 p = unicode->str;
1239 e = s + size;
1240
1241 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001242 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243
1244 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001245 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001246 s++;
1247 continue;
1248 }
1249
1250 n = utf8_code_length[ch];
1251
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001252 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001253 if (consumed)
1254 break;
1255 else {
1256 errmsg = "unexpected end of data";
1257 startinpos = s-starts;
1258 endinpos = size;
1259 goto utf8Error;
1260 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001261 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001262
1263 switch (n) {
1264
1265 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001266 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001267 startinpos = s-starts;
1268 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001269 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001270
1271 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001272 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001273 startinpos = s-starts;
1274 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001275 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276
1277 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001278 if ((s[1] & 0xc0) != 0x80) {
1279 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001280 startinpos = s-starts;
1281 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001282 goto utf8Error;
1283 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001284 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001285 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001286 startinpos = s-starts;
1287 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001288 errmsg = "illegal encoding";
1289 goto utf8Error;
1290 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001292 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 break;
1294
1295 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001296 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001297 (s[2] & 0xc0) != 0x80) {
1298 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001299 startinpos = s-starts;
1300 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001301 goto utf8Error;
1302 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001303 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001304 if (ch < 0x0800) {
1305 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001306 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001307
1308 XXX For wide builds (UCS-4) we should probably try
1309 to recombine the surrogates into a single code
1310 unit.
1311 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001312 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001313 startinpos = s-starts;
1314 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001315 goto utf8Error;
1316 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001317 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001318 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001319 break;
1320
1321 case 4:
1322 if ((s[1] & 0xc0) != 0x80 ||
1323 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001324 (s[3] & 0xc0) != 0x80) {
1325 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001326 startinpos = s-starts;
1327 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001328 goto utf8Error;
1329 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001330 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1331 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1332 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001333 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001334 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001335 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001336 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001337 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001338 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001339 startinpos = s-starts;
1340 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001341 goto utf8Error;
1342 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001343#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001344 *p++ = (Py_UNICODE)ch;
1345#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001346 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001347
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001348 /* translate from 10000..10FFFF to 0..FFFF */
1349 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001350
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001351 /* high surrogate = top 10 bits added to D800 */
1352 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001353
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001354 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001355 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001356#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001357 break;
1358
1359 default:
1360 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001361 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001362 startinpos = s-starts;
1363 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001364 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365 }
1366 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001367 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001368
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001369 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001370 outpos = p-PyUnicode_AS_UNICODE(unicode);
1371 if (unicode_decode_call_errorhandler(
1372 errors, &errorHandler,
1373 "utf8", errmsg,
1374 starts, size, &startinpos, &endinpos, &exc, &s,
1375 (PyObject **)&unicode, &outpos, &p))
1376 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001377 }
Walter Dörwald69652032004-09-07 20:24:22 +00001378 if (consumed)
1379 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001380
1381 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001382 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001383 goto onError;
1384
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001385 Py_XDECREF(errorHandler);
1386 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 return (PyObject *)unicode;
1388
1389onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001390 Py_XDECREF(errorHandler);
1391 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 Py_DECREF(unicode);
1393 return NULL;
1394}
1395
Tim Peters602f7402002-04-27 18:03:26 +00001396/* Allocation strategy: if the string is short, convert into a stack buffer
1397 and allocate exactly as much space needed at the end. Else allocate the
1398 maximum possible needed (4 result bytes per Unicode character), and return
1399 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001400*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001401PyObject *
1402PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001403 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001404 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001405{
Tim Peters602f7402002-04-27 18:03:26 +00001406#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001407
Martin v. Löwis18e16552006-02-15 17:27:45 +00001408 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001409 PyObject *v; /* result string object */
1410 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001411 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001412 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001413 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001414
Tim Peters602f7402002-04-27 18:03:26 +00001415 assert(s != NULL);
1416 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001417
Tim Peters602f7402002-04-27 18:03:26 +00001418 if (size <= MAX_SHORT_UNICHARS) {
1419 /* Write into the stack buffer; nallocated can't overflow.
1420 * At the end, we'll allocate exactly as much heap space as it
1421 * turns out we need.
1422 */
1423 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1424 v = NULL; /* will allocate after we're done */
1425 p = stackbuf;
1426 }
1427 else {
1428 /* Overallocate on the heap, and give the excess back at the end. */
1429 nallocated = size * 4;
1430 if (nallocated / 4 != size) /* overflow! */
1431 return PyErr_NoMemory();
1432 v = PyString_FromStringAndSize(NULL, nallocated);
1433 if (v == NULL)
1434 return NULL;
1435 p = PyString_AS_STRING(v);
1436 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001437
Tim Peters602f7402002-04-27 18:03:26 +00001438 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001439 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001440
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001441 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001442 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001443 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001444
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001446 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001447 *p++ = (char)(0xc0 | (ch >> 6));
1448 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001449 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001450 else {
Tim Peters602f7402002-04-27 18:03:26 +00001451 /* Encode UCS2 Unicode ordinals */
1452 if (ch < 0x10000) {
1453 /* Special case: check for high surrogate */
1454 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1455 Py_UCS4 ch2 = s[i];
1456 /* Check for low surrogate and combine the two to
1457 form a UCS4 value */
1458 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001459 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001460 i++;
1461 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001462 }
Tim Peters602f7402002-04-27 18:03:26 +00001463 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001464 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001465 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001466 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1467 *p++ = (char)(0x80 | (ch & 0x3f));
1468 continue;
1469 }
1470encodeUCS4:
1471 /* Encode UCS4 Unicode ordinals */
1472 *p++ = (char)(0xf0 | (ch >> 18));
1473 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1474 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1475 *p++ = (char)(0x80 | (ch & 0x3f));
1476 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001477 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001478
Tim Peters602f7402002-04-27 18:03:26 +00001479 if (v == NULL) {
1480 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001481 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001482 assert(nneeded <= nallocated);
1483 v = PyString_FromStringAndSize(stackbuf, nneeded);
1484 }
1485 else {
1486 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001487 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001488 assert(nneeded <= nallocated);
1489 _PyString_Resize(&v, nneeded);
1490 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001491 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001492
Tim Peters602f7402002-04-27 18:03:26 +00001493#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001494}
1495
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1497{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001498 if (!PyUnicode_Check(unicode)) {
1499 PyErr_BadArgument();
1500 return NULL;
1501 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001502 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1503 PyUnicode_GET_SIZE(unicode),
1504 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001505}
1506
1507/* --- UTF-16 Codec ------------------------------------------------------- */
1508
Tim Peters772747b2001-08-09 22:21:55 +00001509PyObject *
1510PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001511 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001512 const char *errors,
1513 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001514{
Walter Dörwald69652032004-09-07 20:24:22 +00001515 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1516}
1517
1518PyObject *
1519PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001520 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001521 const char *errors,
1522 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001523 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001524{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001525 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001526 Py_ssize_t startinpos;
1527 Py_ssize_t endinpos;
1528 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001529 PyUnicodeObject *unicode;
1530 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001531 const unsigned char *q, *e;
1532 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001533 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001534 /* Offsets from q for retrieving byte pairs in the right order. */
1535#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1536 int ihi = 1, ilo = 0;
1537#else
1538 int ihi = 0, ilo = 1;
1539#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001540 PyObject *errorHandler = NULL;
1541 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542
1543 /* Note: size will always be longer than the resulting Unicode
1544 character count */
1545 unicode = _PyUnicode_New(size);
1546 if (!unicode)
1547 return NULL;
1548 if (size == 0)
1549 return (PyObject *)unicode;
1550
1551 /* Unpack UTF-16 encoded data */
1552 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001553 q = (unsigned char *)s;
1554 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555
1556 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001557 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001558
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001559 /* Check for BOM marks (U+FEFF) in the input and adjust current
1560 byte order setting accordingly. In native mode, the leading BOM
1561 mark is skipped, in all other modes, it is copied to the output
1562 stream as-is (giving a ZWNBSP character). */
1563 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001564 if (size >= 2) {
1565 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001566#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001567 if (bom == 0xFEFF) {
1568 q += 2;
1569 bo = -1;
1570 }
1571 else if (bom == 0xFFFE) {
1572 q += 2;
1573 bo = 1;
1574 }
Tim Petersced69f82003-09-16 20:30:58 +00001575#else
Walter Dörwald69652032004-09-07 20:24:22 +00001576 if (bom == 0xFEFF) {
1577 q += 2;
1578 bo = 1;
1579 }
1580 else if (bom == 0xFFFE) {
1581 q += 2;
1582 bo = -1;
1583 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001584#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001585 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001587
Tim Peters772747b2001-08-09 22:21:55 +00001588 if (bo == -1) {
1589 /* force LE */
1590 ihi = 1;
1591 ilo = 0;
1592 }
1593 else if (bo == 1) {
1594 /* force BE */
1595 ihi = 0;
1596 ilo = 1;
1597 }
1598
1599 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001600 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001601 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001602 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001603 if (consumed)
1604 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001605 errmsg = "truncated data";
1606 startinpos = ((const char *)q)-starts;
1607 endinpos = ((const char *)e)-starts;
1608 goto utf16Error;
1609 /* The remaining input chars are ignored if the callback
1610 chooses to skip the input */
1611 }
1612 ch = (q[ihi] << 8) | q[ilo];
1613
Tim Peters772747b2001-08-09 22:21:55 +00001614 q += 2;
1615
Guido van Rossumd57fd912000-03-10 22:53:23 +00001616 if (ch < 0xD800 || ch > 0xDFFF) {
1617 *p++ = ch;
1618 continue;
1619 }
1620
1621 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001622 if (q >= e) {
1623 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001624 startinpos = (((const char *)q)-2)-starts;
1625 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001626 goto utf16Error;
1627 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001628 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001629 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1630 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001631 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001632#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001633 *p++ = ch;
1634 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001635#else
1636 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001637#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001638 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001639 }
1640 else {
1641 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001642 startinpos = (((const char *)q)-4)-starts;
1643 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001644 goto utf16Error;
1645 }
1646
Guido van Rossumd57fd912000-03-10 22:53:23 +00001647 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001648 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001649 startinpos = (((const char *)q)-2)-starts;
1650 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001651 /* Fall through to report the error */
1652
1653 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001654 outpos = p-PyUnicode_AS_UNICODE(unicode);
1655 if (unicode_decode_call_errorhandler(
1656 errors, &errorHandler,
1657 "utf16", errmsg,
1658 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1659 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001660 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001661 }
1662
1663 if (byteorder)
1664 *byteorder = bo;
1665
Walter Dörwald69652032004-09-07 20:24:22 +00001666 if (consumed)
1667 *consumed = (const char *)q-starts;
1668
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001670 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001671 goto onError;
1672
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001673 Py_XDECREF(errorHandler);
1674 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001675 return (PyObject *)unicode;
1676
1677onError:
1678 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001679 Py_XDECREF(errorHandler);
1680 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001681 return NULL;
1682}
1683
Tim Peters772747b2001-08-09 22:21:55 +00001684PyObject *
1685PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001686 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001687 const char *errors,
1688 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001689{
1690 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001691 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001692#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001693 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001694#else
1695 const int pairs = 0;
1696#endif
Tim Peters772747b2001-08-09 22:21:55 +00001697 /* Offsets from p for storing byte pairs in the right order. */
1698#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1699 int ihi = 1, ilo = 0;
1700#else
1701 int ihi = 0, ilo = 1;
1702#endif
1703
1704#define STORECHAR(CH) \
1705 do { \
1706 p[ihi] = ((CH) >> 8) & 0xff; \
1707 p[ilo] = (CH) & 0xff; \
1708 p += 2; \
1709 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001711#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001712 for (i = pairs = 0; i < size; i++)
1713 if (s[i] >= 0x10000)
1714 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001715#endif
Tim Petersced69f82003-09-16 20:30:58 +00001716 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001717 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718 if (v == NULL)
1719 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001720
Tim Peters772747b2001-08-09 22:21:55 +00001721 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001722 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001723 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001724 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001725 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001726
1727 if (byteorder == -1) {
1728 /* force LE */
1729 ihi = 1;
1730 ilo = 0;
1731 }
1732 else if (byteorder == 1) {
1733 /* force BE */
1734 ihi = 0;
1735 ilo = 1;
1736 }
1737
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001738 while (size-- > 0) {
1739 Py_UNICODE ch = *s++;
1740 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001741#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001742 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001743 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1744 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001746#endif
Tim Peters772747b2001-08-09 22:21:55 +00001747 STORECHAR(ch);
1748 if (ch2)
1749 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001750 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001752#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753}
1754
1755PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1756{
1757 if (!PyUnicode_Check(unicode)) {
1758 PyErr_BadArgument();
1759 return NULL;
1760 }
1761 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1762 PyUnicode_GET_SIZE(unicode),
1763 NULL,
1764 0);
1765}
1766
1767/* --- Unicode Escape Codec ----------------------------------------------- */
1768
Fredrik Lundh06d12682001-01-24 07:59:11 +00001769static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001770
Guido van Rossumd57fd912000-03-10 22:53:23 +00001771PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001772 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001773 const char *errors)
1774{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001775 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001776 Py_ssize_t startinpos;
1777 Py_ssize_t endinpos;
1778 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001779 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001781 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001783 char* message;
1784 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001785 PyObject *errorHandler = NULL;
1786 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001787
Guido van Rossumd57fd912000-03-10 22:53:23 +00001788 /* Escaped strings will always be longer than the resulting
1789 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001790 length after conversion to the true value.
1791 (but if the error callback returns a long replacement string
1792 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 v = _PyUnicode_New(size);
1794 if (v == NULL)
1795 goto onError;
1796 if (size == 0)
1797 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001798
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001799 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001800 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001801
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802 while (s < end) {
1803 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001804 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001805 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001806
1807 /* Non-escape characters are interpreted as Unicode ordinals */
1808 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001809 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810 continue;
1811 }
1812
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001813 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001814 /* \ - Escapes */
1815 s++;
1816 switch (*s++) {
1817
1818 /* \x escapes */
1819 case '\n': break;
1820 case '\\': *p++ = '\\'; break;
1821 case '\'': *p++ = '\''; break;
1822 case '\"': *p++ = '\"'; break;
1823 case 'b': *p++ = '\b'; break;
1824 case 'f': *p++ = '\014'; break; /* FF */
1825 case 't': *p++ = '\t'; break;
1826 case 'n': *p++ = '\n'; break;
1827 case 'r': *p++ = '\r'; break;
1828 case 'v': *p++ = '\013'; break; /* VT */
1829 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1830
1831 /* \OOO (octal) escapes */
1832 case '0': case '1': case '2': case '3':
1833 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001834 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001836 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001837 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001838 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001839 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001840 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001841 break;
1842
Fredrik Lundhccc74732001-02-18 22:13:49 +00001843 /* hex escapes */
1844 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001845 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001846 digits = 2;
1847 message = "truncated \\xXX escape";
1848 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849
Fredrik Lundhccc74732001-02-18 22:13:49 +00001850 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001851 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001852 digits = 4;
1853 message = "truncated \\uXXXX escape";
1854 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001855
Fredrik Lundhccc74732001-02-18 22:13:49 +00001856 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001857 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001858 digits = 8;
1859 message = "truncated \\UXXXXXXXX escape";
1860 hexescape:
1861 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001862 outpos = p-PyUnicode_AS_UNICODE(v);
1863 if (s+digits>end) {
1864 endinpos = size;
1865 if (unicode_decode_call_errorhandler(
1866 errors, &errorHandler,
1867 "unicodeescape", "end of string in escape sequence",
1868 starts, size, &startinpos, &endinpos, &exc, &s,
1869 (PyObject **)&v, &outpos, &p))
1870 goto onError;
1871 goto nextByte;
1872 }
1873 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001874 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001875 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001876 endinpos = (s+i+1)-starts;
1877 if (unicode_decode_call_errorhandler(
1878 errors, &errorHandler,
1879 "unicodeescape", message,
1880 starts, size, &startinpos, &endinpos, &exc, &s,
1881 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001882 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001883 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001884 }
1885 chr = (chr<<4) & ~0xF;
1886 if (c >= '0' && c <= '9')
1887 chr += c - '0';
1888 else if (c >= 'a' && c <= 'f')
1889 chr += 10 + c - 'a';
1890 else
1891 chr += 10 + c - 'A';
1892 }
1893 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001894 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001895 /* _decoding_error will have already written into the
1896 target buffer. */
1897 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001898 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001899 /* when we get here, chr is a 32-bit unicode character */
1900 if (chr <= 0xffff)
1901 /* UCS-2 character */
1902 *p++ = (Py_UNICODE) chr;
1903 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001904 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001905 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001906#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001907 *p++ = chr;
1908#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001909 chr -= 0x10000L;
1910 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001911 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001912#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001913 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001914 endinpos = s-starts;
1915 outpos = p-PyUnicode_AS_UNICODE(v);
1916 if (unicode_decode_call_errorhandler(
1917 errors, &errorHandler,
1918 "unicodeescape", "illegal Unicode character",
1919 starts, size, &startinpos, &endinpos, &exc, &s,
1920 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001921 goto onError;
1922 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001923 break;
1924
1925 /* \N{name} */
1926 case 'N':
1927 message = "malformed \\N character escape";
1928 if (ucnhash_CAPI == NULL) {
1929 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001930 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001931 m = PyImport_ImportModule("unicodedata");
1932 if (m == NULL)
1933 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001934 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001935 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001936 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001937 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00001938 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001939 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001940 if (ucnhash_CAPI == NULL)
1941 goto ucnhashError;
1942 }
1943 if (*s == '{') {
1944 const char *start = s+1;
1945 /* look for the closing brace */
1946 while (*s != '}' && s < end)
1947 s++;
1948 if (s > start && s < end && *s == '}') {
1949 /* found a name. look it up in the unicode database */
1950 message = "unknown Unicode character name";
1951 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001952 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001953 goto store;
1954 }
1955 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001956 endinpos = s-starts;
1957 outpos = p-PyUnicode_AS_UNICODE(v);
1958 if (unicode_decode_call_errorhandler(
1959 errors, &errorHandler,
1960 "unicodeescape", message,
1961 starts, size, &startinpos, &endinpos, &exc, &s,
1962 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001963 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001964 break;
1965
1966 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001967 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001968 message = "\\ at end of string";
1969 s--;
1970 endinpos = s-starts;
1971 outpos = p-PyUnicode_AS_UNICODE(v);
1972 if (unicode_decode_call_errorhandler(
1973 errors, &errorHandler,
1974 "unicodeescape", message,
1975 starts, size, &startinpos, &endinpos, &exc, &s,
1976 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001977 goto onError;
1978 }
1979 else {
1980 *p++ = '\\';
1981 *p++ = (unsigned char)s[-1];
1982 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001983 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001985 nextByte:
1986 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00001988 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001989 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001990 Py_XDECREF(errorHandler);
1991 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001993
Fredrik Lundhccc74732001-02-18 22:13:49 +00001994ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001995 PyErr_SetString(
1996 PyExc_UnicodeError,
1997 "\\N escapes not supported (can't load unicodedata module)"
1998 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001999 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002000 Py_XDECREF(errorHandler);
2001 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002002 return NULL;
2003
Fredrik Lundhccc74732001-02-18 22:13:49 +00002004onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002005 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002006 Py_XDECREF(errorHandler);
2007 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008 return NULL;
2009}
2010
2011/* Return a Unicode-Escape string version of the Unicode object.
2012
2013 If quotes is true, the string is enclosed in u"" or u'' quotes as
2014 appropriate.
2015
2016*/
2017
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002018Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Fredrik Lundh95e2a912006-05-26 11:38:15 +00002019 Py_ssize_t size,
2020 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002021{
2022 /* like wcschr, but doesn't stop at NULL characters */
2023
2024 while (size-- > 0) {
2025 if (*s == ch)
2026 return s;
2027 s++;
2028 }
2029
2030 return NULL;
2031}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002032
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033static
2034PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002035 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036 int quotes)
2037{
2038 PyObject *repr;
2039 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002041 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042
Neal Norwitz19c35bb2006-08-21 22:13:11 +00002043 /* Initial allocation is based on the longest-possible unichr
2044 escape.
2045
2046 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2047 unichr, so in this case it's the longest unichr escape. In
2048 narrow (UTF-16) builds this is five chars per source unichr
2049 since there are two unichrs in the surrogate pair, so in narrow
2050 (UTF-16) builds it's not the longest unichr escape.
2051
2052 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2053 so in the narrow (UTF-16) build case it's the longest unichr
2054 escape.
2055 */
2056
2057 repr = PyString_FromStringAndSize(NULL,
2058 2
2059#ifdef Py_UNICODE_WIDE
2060 + 10*size
2061#else
2062 + 6*size
2063#endif
2064 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002065 if (repr == NULL)
2066 return NULL;
2067
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002068 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069
2070 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002071 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002072 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002073 !findchar(s, size, '"')) ? '"' : '\'';
2074 }
2075 while (size-- > 0) {
2076 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002077
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002078 /* Escape quotes and backslashes */
2079 if ((quotes &&
2080 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002081 *p++ = '\\';
2082 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002083 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002084 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002085
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002086#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002087 /* Map 21-bit characters to '\U00xxxxxx' */
2088 else if (ch >= 0x10000) {
2089 *p++ = '\\';
2090 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002091 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2092 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2093 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2094 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2095 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2096 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2097 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002098 *p++ = hexdigit[ch & 0x0000000F];
2099 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002100 }
Neal Norwitz19c35bb2006-08-21 22:13:11 +00002101#else
2102 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002103 else if (ch >= 0xD800 && ch < 0xDC00) {
2104 Py_UNICODE ch2;
2105 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002106
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002107 ch2 = *s++;
2108 size--;
2109 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2110 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2111 *p++ = '\\';
2112 *p++ = 'U';
2113 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2114 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2115 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2116 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2117 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2118 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2119 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2120 *p++ = hexdigit[ucs & 0x0000000F];
2121 continue;
2122 }
2123 /* Fall through: isolated surrogates are copied as-is */
2124 s--;
2125 size++;
2126 }
Neal Norwitz19c35bb2006-08-21 22:13:11 +00002127#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002128
Guido van Rossumd57fd912000-03-10 22:53:23 +00002129 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002130 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002131 *p++ = '\\';
2132 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002133 *p++ = hexdigit[(ch >> 12) & 0x000F];
2134 *p++ = hexdigit[(ch >> 8) & 0x000F];
2135 *p++ = hexdigit[(ch >> 4) & 0x000F];
2136 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002137 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002138
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002139 /* Map special whitespace to '\t', \n', '\r' */
2140 else if (ch == '\t') {
2141 *p++ = '\\';
2142 *p++ = 't';
2143 }
2144 else if (ch == '\n') {
2145 *p++ = '\\';
2146 *p++ = 'n';
2147 }
2148 else if (ch == '\r') {
2149 *p++ = '\\';
2150 *p++ = 'r';
2151 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002152
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002153 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002154 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002155 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002156 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002157 *p++ = hexdigit[(ch >> 4) & 0x000F];
2158 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002159 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002160
Guido van Rossumd57fd912000-03-10 22:53:23 +00002161 /* Copy everything else as-is */
2162 else
2163 *p++ = (char) ch;
2164 }
2165 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002166 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167
2168 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002169 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002170 return repr;
2171}
2172
2173PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002174 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002175{
2176 return unicodeescape_string(s, size, 0);
2177}
2178
2179PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2180{
2181 if (!PyUnicode_Check(unicode)) {
2182 PyErr_BadArgument();
2183 return NULL;
2184 }
2185 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2186 PyUnicode_GET_SIZE(unicode));
2187}
2188
2189/* --- Raw Unicode Escape Codec ------------------------------------------- */
2190
2191PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002192 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002193 const char *errors)
2194{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002195 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002196 Py_ssize_t startinpos;
2197 Py_ssize_t endinpos;
2198 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002199 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002200 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002201 const char *end;
2202 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002203 PyObject *errorHandler = NULL;
2204 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002205
Guido van Rossumd57fd912000-03-10 22:53:23 +00002206 /* Escaped strings will always be longer than the resulting
2207 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002208 length after conversion to the true value. (But decoding error
2209 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002210 v = _PyUnicode_New(size);
2211 if (v == NULL)
2212 goto onError;
2213 if (size == 0)
2214 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002215 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002216 end = s + size;
2217 while (s < end) {
2218 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002219 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002220 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002221 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222
2223 /* Non-escape characters are interpreted as Unicode ordinals */
2224 if (*s != '\\') {
2225 *p++ = (unsigned char)*s++;
2226 continue;
2227 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002228 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002229
2230 /* \u-escapes are only interpreted iff the number of leading
2231 backslashes if odd */
2232 bs = s;
2233 for (;s < end;) {
2234 if (*s != '\\')
2235 break;
2236 *p++ = (unsigned char)*s++;
2237 }
2238 if (((s - bs) & 1) == 0 ||
2239 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002240 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002241 continue;
2242 }
2243 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002244 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002245 s++;
2246
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002247 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002248 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002249 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002250 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002251 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002252 endinpos = s-starts;
2253 if (unicode_decode_call_errorhandler(
2254 errors, &errorHandler,
2255 "rawunicodeescape", "truncated \\uXXXX",
2256 starts, size, &startinpos, &endinpos, &exc, &s,
2257 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002259 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002260 }
2261 x = (x<<4) & ~0xF;
2262 if (c >= '0' && c <= '9')
2263 x += c - '0';
2264 else if (c >= 'a' && c <= 'f')
2265 x += 10 + c - 'a';
2266 else
2267 x += 10 + c - 'A';
2268 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002269#ifndef Py_UNICODE_WIDE
2270 if (x > 0x10000) {
2271 if (unicode_decode_call_errorhandler(
2272 errors, &errorHandler,
2273 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2274 starts, size, &startinpos, &endinpos, &exc, &s,
2275 (PyObject **)&v, &outpos, &p))
2276 goto onError;
2277 }
2278#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002279 *p++ = x;
2280 nextByte:
2281 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002282 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002283 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002284 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002285 Py_XDECREF(errorHandler);
2286 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002287 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002288
Guido van Rossumd57fd912000-03-10 22:53:23 +00002289 onError:
2290 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002291 Py_XDECREF(errorHandler);
2292 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002293 return NULL;
2294}
2295
2296PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002297 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298{
2299 PyObject *repr;
2300 char *p;
2301 char *q;
2302
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002303 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002304
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002305#ifdef Py_UNICODE_WIDE
2306 repr = PyString_FromStringAndSize(NULL, 10 * size);
2307#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002308 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002309#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002310 if (repr == NULL)
2311 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002312 if (size == 0)
2313 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002314
2315 p = q = PyString_AS_STRING(repr);
2316 while (size-- > 0) {
2317 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002318#ifdef Py_UNICODE_WIDE
2319 /* Map 32-bit characters to '\Uxxxxxxxx' */
2320 if (ch >= 0x10000) {
2321 *p++ = '\\';
2322 *p++ = 'U';
2323 *p++ = hexdigit[(ch >> 28) & 0xf];
2324 *p++ = hexdigit[(ch >> 24) & 0xf];
2325 *p++ = hexdigit[(ch >> 20) & 0xf];
2326 *p++ = hexdigit[(ch >> 16) & 0xf];
2327 *p++ = hexdigit[(ch >> 12) & 0xf];
2328 *p++ = hexdigit[(ch >> 8) & 0xf];
2329 *p++ = hexdigit[(ch >> 4) & 0xf];
2330 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002331 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002332 else
2333#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002334 /* Map 16-bit characters to '\uxxxx' */
2335 if (ch >= 256) {
2336 *p++ = '\\';
2337 *p++ = 'u';
2338 *p++ = hexdigit[(ch >> 12) & 0xf];
2339 *p++ = hexdigit[(ch >> 8) & 0xf];
2340 *p++ = hexdigit[(ch >> 4) & 0xf];
2341 *p++ = hexdigit[ch & 15];
2342 }
2343 /* Copy everything else as-is */
2344 else
2345 *p++ = (char) ch;
2346 }
2347 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002348 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002349 return repr;
2350}
2351
2352PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2353{
2354 if (!PyUnicode_Check(unicode)) {
2355 PyErr_BadArgument();
2356 return NULL;
2357 }
2358 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2359 PyUnicode_GET_SIZE(unicode));
2360}
2361
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002362/* --- Unicode Internal Codec ------------------------------------------- */
2363
2364PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002365 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002366 const char *errors)
2367{
2368 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002369 Py_ssize_t startinpos;
2370 Py_ssize_t endinpos;
2371 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002372 PyUnicodeObject *v;
2373 Py_UNICODE *p;
2374 const char *end;
2375 const char *reason;
2376 PyObject *errorHandler = NULL;
2377 PyObject *exc = NULL;
2378
Neal Norwitzd43069c2006-01-08 01:12:10 +00002379#ifdef Py_UNICODE_WIDE
2380 Py_UNICODE unimax = PyUnicode_GetMax();
2381#endif
2382
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002383 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2384 if (v == NULL)
2385 goto onError;
2386 if (PyUnicode_GetSize((PyObject *)v) == 0)
2387 return (PyObject *)v;
2388 p = PyUnicode_AS_UNICODE(v);
2389 end = s + size;
2390
2391 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00002392 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002393 /* We have to sanity check the raw data, otherwise doom looms for
2394 some malformed UCS-4 data. */
2395 if (
2396 #ifdef Py_UNICODE_WIDE
2397 *p > unimax || *p < 0 ||
2398 #endif
2399 end-s < Py_UNICODE_SIZE
2400 )
2401 {
2402 startinpos = s - starts;
2403 if (end-s < Py_UNICODE_SIZE) {
2404 endinpos = end-starts;
2405 reason = "truncated input";
2406 }
2407 else {
2408 endinpos = s - starts + Py_UNICODE_SIZE;
2409 reason = "illegal code point (> 0x10FFFF)";
2410 }
2411 outpos = p - PyUnicode_AS_UNICODE(v);
2412 if (unicode_decode_call_errorhandler(
2413 errors, &errorHandler,
2414 "unicode_internal", reason,
2415 starts, size, &startinpos, &endinpos, &exc, &s,
2416 (PyObject **)&v, &outpos, &p)) {
2417 goto onError;
2418 }
2419 }
2420 else {
2421 p++;
2422 s += Py_UNICODE_SIZE;
2423 }
2424 }
2425
Martin v. Löwis412fb672006-04-13 06:34:32 +00002426 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002427 goto onError;
2428 Py_XDECREF(errorHandler);
2429 Py_XDECREF(exc);
2430 return (PyObject *)v;
2431
2432 onError:
2433 Py_XDECREF(v);
2434 Py_XDECREF(errorHandler);
2435 Py_XDECREF(exc);
2436 return NULL;
2437}
2438
Guido van Rossumd57fd912000-03-10 22:53:23 +00002439/* --- Latin-1 Codec ------------------------------------------------------ */
2440
2441PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002442 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002443 const char *errors)
2444{
2445 PyUnicodeObject *v;
2446 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002447
Guido van Rossumd57fd912000-03-10 22:53:23 +00002448 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002449 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002450 Py_UNICODE r = *(unsigned char*)s;
2451 return PyUnicode_FromUnicode(&r, 1);
2452 }
2453
Guido van Rossumd57fd912000-03-10 22:53:23 +00002454 v = _PyUnicode_New(size);
2455 if (v == NULL)
2456 goto onError;
2457 if (size == 0)
2458 return (PyObject *)v;
2459 p = PyUnicode_AS_UNICODE(v);
2460 while (size-- > 0)
2461 *p++ = (unsigned char)*s++;
2462 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002463
Guido van Rossumd57fd912000-03-10 22:53:23 +00002464 onError:
2465 Py_XDECREF(v);
2466 return NULL;
2467}
2468
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002469/* create or adjust a UnicodeEncodeError */
2470static void make_encode_exception(PyObject **exceptionObject,
2471 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002472 const Py_UNICODE *unicode, Py_ssize_t size,
2473 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002474 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002475{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002476 if (*exceptionObject == NULL) {
2477 *exceptionObject = PyUnicodeEncodeError_Create(
2478 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002479 }
2480 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002481 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2482 goto onError;
2483 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2484 goto onError;
2485 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2486 goto onError;
2487 return;
2488 onError:
2489 Py_DECREF(*exceptionObject);
2490 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002491 }
2492}
2493
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002494/* raises a UnicodeEncodeError */
2495static void raise_encode_exception(PyObject **exceptionObject,
2496 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002497 const Py_UNICODE *unicode, Py_ssize_t size,
2498 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002499 const char *reason)
2500{
2501 make_encode_exception(exceptionObject,
2502 encoding, unicode, size, startpos, endpos, reason);
2503 if (*exceptionObject != NULL)
2504 PyCodec_StrictErrors(*exceptionObject);
2505}
2506
2507/* error handling callback helper:
2508 build arguments, call the callback and check the arguments,
2509 put the result into newpos and return the replacement string, which
2510 has to be freed by the caller */
2511static PyObject *unicode_encode_call_errorhandler(const char *errors,
2512 PyObject **errorHandler,
2513 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002514 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2515 Py_ssize_t startpos, Py_ssize_t endpos,
2516 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002517{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002518 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002519
2520 PyObject *restuple;
2521 PyObject *resunicode;
2522
2523 if (*errorHandler == NULL) {
2524 *errorHandler = PyCodec_LookupError(errors);
2525 if (*errorHandler == NULL)
2526 return NULL;
2527 }
2528
2529 make_encode_exception(exceptionObject,
2530 encoding, unicode, size, startpos, endpos, reason);
2531 if (*exceptionObject == NULL)
2532 return NULL;
2533
2534 restuple = PyObject_CallFunctionObjArgs(
2535 *errorHandler, *exceptionObject, NULL);
2536 if (restuple == NULL)
2537 return NULL;
2538 if (!PyTuple_Check(restuple)) {
2539 PyErr_Format(PyExc_TypeError, &argparse[4]);
2540 Py_DECREF(restuple);
2541 return NULL;
2542 }
2543 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2544 &resunicode, newpos)) {
2545 Py_DECREF(restuple);
2546 return NULL;
2547 }
2548 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002549 *newpos = size+*newpos;
2550 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002551 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002552 Py_DECREF(restuple);
2553 return NULL;
2554 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002555 Py_INCREF(resunicode);
2556 Py_DECREF(restuple);
2557 return resunicode;
2558}
2559
2560static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002561 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002562 const char *errors,
2563 int limit)
2564{
2565 /* output object */
2566 PyObject *res;
2567 /* pointers to the beginning and end+1 of input */
2568 const Py_UNICODE *startp = p;
2569 const Py_UNICODE *endp = p + size;
2570 /* pointer to the beginning of the unencodable characters */
2571 /* const Py_UNICODE *badp = NULL; */
2572 /* pointer into the output */
2573 char *str;
2574 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002575 Py_ssize_t respos = 0;
2576 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00002577 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2578 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002579 PyObject *errorHandler = NULL;
2580 PyObject *exc = NULL;
2581 /* the following variable is used for caching string comparisons
2582 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2583 int known_errorHandler = -1;
2584
2585 /* allocate enough for a simple encoding without
2586 replacements, if we need more, we'll resize */
2587 res = PyString_FromStringAndSize(NULL, size);
2588 if (res == NULL)
2589 goto onError;
2590 if (size == 0)
2591 return res;
2592 str = PyString_AS_STRING(res);
2593 ressize = size;
2594
2595 while (p<endp) {
2596 Py_UNICODE c = *p;
2597
2598 /* can we encode this? */
2599 if (c<limit) {
2600 /* no overflow check, because we know that the space is enough */
2601 *str++ = (char)c;
2602 ++p;
2603 }
2604 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002605 Py_ssize_t unicodepos = p-startp;
2606 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002607 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002608 Py_ssize_t repsize;
2609 Py_ssize_t newpos;
2610 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002611 Py_UNICODE *uni2;
2612 /* startpos for collecting unencodable chars */
2613 const Py_UNICODE *collstart = p;
2614 const Py_UNICODE *collend = p;
2615 /* find all unecodable characters */
2616 while ((collend < endp) && ((*collend)>=limit))
2617 ++collend;
2618 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2619 if (known_errorHandler==-1) {
2620 if ((errors==NULL) || (!strcmp(errors, "strict")))
2621 known_errorHandler = 1;
2622 else if (!strcmp(errors, "replace"))
2623 known_errorHandler = 2;
2624 else if (!strcmp(errors, "ignore"))
2625 known_errorHandler = 3;
2626 else if (!strcmp(errors, "xmlcharrefreplace"))
2627 known_errorHandler = 4;
2628 else
2629 known_errorHandler = 0;
2630 }
2631 switch (known_errorHandler) {
2632 case 1: /* strict */
2633 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2634 goto onError;
2635 case 2: /* replace */
2636 while (collstart++<collend)
2637 *str++ = '?'; /* fall through */
2638 case 3: /* ignore */
2639 p = collend;
2640 break;
2641 case 4: /* xmlcharrefreplace */
2642 respos = str-PyString_AS_STRING(res);
2643 /* determine replacement size (temporarily (mis)uses p) */
2644 for (p = collstart, repsize = 0; p < collend; ++p) {
2645 if (*p<10)
2646 repsize += 2+1+1;
2647 else if (*p<100)
2648 repsize += 2+2+1;
2649 else if (*p<1000)
2650 repsize += 2+3+1;
2651 else if (*p<10000)
2652 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002653#ifndef Py_UNICODE_WIDE
2654 else
2655 repsize += 2+5+1;
2656#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002657 else if (*p<100000)
2658 repsize += 2+5+1;
2659 else if (*p<1000000)
2660 repsize += 2+6+1;
2661 else
2662 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002663#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002664 }
2665 requiredsize = respos+repsize+(endp-collend);
2666 if (requiredsize > ressize) {
2667 if (requiredsize<2*ressize)
2668 requiredsize = 2*ressize;
2669 if (_PyString_Resize(&res, requiredsize))
2670 goto onError;
2671 str = PyString_AS_STRING(res) + respos;
2672 ressize = requiredsize;
2673 }
2674 /* generate replacement (temporarily (mis)uses p) */
2675 for (p = collstart; p < collend; ++p) {
2676 str += sprintf(str, "&#%d;", (int)*p);
2677 }
2678 p = collend;
2679 break;
2680 default:
2681 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2682 encoding, reason, startp, size, &exc,
2683 collstart-startp, collend-startp, &newpos);
2684 if (repunicode == NULL)
2685 goto onError;
2686 /* need more space? (at least enough for what we
2687 have+the replacement+the rest of the string, so
2688 we won't have to check space for encodable characters) */
2689 respos = str-PyString_AS_STRING(res);
2690 repsize = PyUnicode_GET_SIZE(repunicode);
2691 requiredsize = respos+repsize+(endp-collend);
2692 if (requiredsize > ressize) {
2693 if (requiredsize<2*ressize)
2694 requiredsize = 2*ressize;
2695 if (_PyString_Resize(&res, requiredsize)) {
2696 Py_DECREF(repunicode);
2697 goto onError;
2698 }
2699 str = PyString_AS_STRING(res) + respos;
2700 ressize = requiredsize;
2701 }
2702 /* check if there is anything unencodable in the replacement
2703 and copy it to the output */
2704 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2705 c = *uni2;
2706 if (c >= limit) {
2707 raise_encode_exception(&exc, encoding, startp, size,
2708 unicodepos, unicodepos+1, reason);
2709 Py_DECREF(repunicode);
2710 goto onError;
2711 }
2712 *str = (char)c;
2713 }
2714 p = startp + newpos;
2715 Py_DECREF(repunicode);
2716 }
2717 }
2718 }
2719 /* Resize if we allocated to much */
2720 respos = str-PyString_AS_STRING(res);
2721 if (respos<ressize)
2722 /* If this falls res will be NULL */
2723 _PyString_Resize(&res, respos);
2724 Py_XDECREF(errorHandler);
2725 Py_XDECREF(exc);
2726 return res;
2727
2728 onError:
2729 Py_XDECREF(res);
2730 Py_XDECREF(errorHandler);
2731 Py_XDECREF(exc);
2732 return NULL;
2733}
2734
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002736 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002737 const char *errors)
2738{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002739 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002740}
2741
2742PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2743{
2744 if (!PyUnicode_Check(unicode)) {
2745 PyErr_BadArgument();
2746 return NULL;
2747 }
2748 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2749 PyUnicode_GET_SIZE(unicode),
2750 NULL);
2751}
2752
2753/* --- 7-bit ASCII Codec -------------------------------------------------- */
2754
Guido van Rossumd57fd912000-03-10 22:53:23 +00002755PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002756 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757 const char *errors)
2758{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002759 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002760 PyUnicodeObject *v;
2761 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002762 Py_ssize_t startinpos;
2763 Py_ssize_t endinpos;
2764 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002765 const char *e;
2766 PyObject *errorHandler = NULL;
2767 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002768
Guido van Rossumd57fd912000-03-10 22:53:23 +00002769 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002770 if (size == 1 && *(unsigned char*)s < 128) {
2771 Py_UNICODE r = *(unsigned char*)s;
2772 return PyUnicode_FromUnicode(&r, 1);
2773 }
Tim Petersced69f82003-09-16 20:30:58 +00002774
Guido van Rossumd57fd912000-03-10 22:53:23 +00002775 v = _PyUnicode_New(size);
2776 if (v == NULL)
2777 goto onError;
2778 if (size == 0)
2779 return (PyObject *)v;
2780 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002781 e = s + size;
2782 while (s < e) {
2783 register unsigned char c = (unsigned char)*s;
2784 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002785 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002786 ++s;
2787 }
2788 else {
2789 startinpos = s-starts;
2790 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002791 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002792 if (unicode_decode_call_errorhandler(
2793 errors, &errorHandler,
2794 "ascii", "ordinal not in range(128)",
2795 starts, size, &startinpos, &endinpos, &exc, &s,
2796 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002797 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002798 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002800 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00002801 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002802 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002803 Py_XDECREF(errorHandler);
2804 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002805 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002806
Guido van Rossumd57fd912000-03-10 22:53:23 +00002807 onError:
2808 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002809 Py_XDECREF(errorHandler);
2810 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811 return NULL;
2812}
2813
Guido van Rossumd57fd912000-03-10 22:53:23 +00002814PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002815 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002816 const char *errors)
2817{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002818 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002819}
2820
2821PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2822{
2823 if (!PyUnicode_Check(unicode)) {
2824 PyErr_BadArgument();
2825 return NULL;
2826 }
2827 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2828 PyUnicode_GET_SIZE(unicode),
2829 NULL);
2830}
2831
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002832#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002833
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002834/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002835
Martin v. Löwisd8251432006-06-14 05:21:04 +00002836#if SIZEOF_INT < SIZEOF_SSIZE_T
2837#define NEED_RETRY
2838#endif
2839
2840/* XXX This code is limited to "true" double-byte encodings, as
2841 a) it assumes an incomplete character consists of a single byte, and
2842 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2843 encodings, see IsDBCSLeadByteEx documentation. */
2844
/* Return 1 if the byte at s[offset] is judged to be a DBCS lead byte
   and 0 otherwise.  A byte that IsDBCSLeadByte() accepts counts when
   CharPrev() cannot step back from it, when the previous character does
   not start with a lead byte, or when the previous character is two
   bytes wide. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
2855
2856/*
2857 * Decode MBCS string into unicode object. If 'final' is set, converts
2858 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
2859 */
2860static int decode_mbcs(PyUnicodeObject **v,
2861 const char *s, /* MBCS string */
2862 int size, /* sizeof MBCS string */
2863 int final)
2864{
2865 Py_UNICODE *p;
2866 Py_ssize_t n = 0;
2867 int usize = 0;
2868
2869 assert(size >= 0);
2870
2871 /* Skip trailing lead-byte unless 'final' is set */
2872 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
2873 --size;
2874
2875 /* First get the size of the result */
2876 if (size > 0) {
2877 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2878 if (usize == 0) {
2879 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2880 return -1;
2881 }
2882 }
2883
2884 if (*v == NULL) {
2885 /* Create unicode object */
2886 *v = _PyUnicode_New(usize);
2887 if (*v == NULL)
2888 return -1;
2889 }
2890 else {
2891 /* Extend unicode object */
2892 n = PyUnicode_GET_SIZE(*v);
2893 if (_PyUnicode_Resize(v, n + usize) < 0)
2894 return -1;
2895 }
2896
2897 /* Do the conversion */
2898 if (size > 0) {
2899 p = PyUnicode_AS_UNICODE(*v) + n;
2900 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2901 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2902 return -1;
2903 }
2904 }
2905
2906 return size;
2907}
2908
2909PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
2910 Py_ssize_t size,
2911 const char *errors,
2912 Py_ssize_t *consumed)
2913{
2914 PyUnicodeObject *v = NULL;
2915 int done;
2916
2917 if (consumed)
2918 *consumed = 0;
2919
2920#ifdef NEED_RETRY
2921 retry:
2922 if (size > INT_MAX)
2923 done = decode_mbcs(&v, s, INT_MAX, 0);
2924 else
2925#endif
2926 done = decode_mbcs(&v, s, (int)size, !consumed);
2927
2928 if (done < 0) {
2929 Py_XDECREF(v);
2930 return NULL;
2931 }
2932
2933 if (consumed)
2934 *consumed += done;
2935
2936#ifdef NEED_RETRY
2937 if (size > INT_MAX) {
2938 s += done;
2939 size -= done;
2940 goto retry;
2941 }
2942#endif
2943
2944 return (PyObject *)v;
2945}
2946
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002947PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002948 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002949 const char *errors)
2950{
Martin v. Löwisd8251432006-06-14 05:21:04 +00002951 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
2952}
2953
2954/*
2955 * Convert unicode into string object (MBCS).
2956 * Returns 0 if succeed, -1 otherwise.
2957 */
2958static int encode_mbcs(PyObject **repr,
2959 const Py_UNICODE *p, /* unicode */
2960 int size) /* size of unicode */
2961{
2962 int mbcssize = 0;
2963 Py_ssize_t n = 0;
2964
2965 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002966
2967 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00002968 if (size > 0) {
2969 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2970 if (mbcssize == 0) {
2971 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2972 return -1;
2973 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002974 }
2975
Martin v. Löwisd8251432006-06-14 05:21:04 +00002976 if (*repr == NULL) {
2977 /* Create string object */
2978 *repr = PyString_FromStringAndSize(NULL, mbcssize);
2979 if (*repr == NULL)
2980 return -1;
2981 }
2982 else {
2983 /* Extend string object */
2984 n = PyString_Size(*repr);
2985 if (_PyString_Resize(repr, n + mbcssize) < 0)
2986 return -1;
2987 }
2988
2989 /* Do the conversion */
2990 if (size > 0) {
2991 char *s = PyString_AS_STRING(*repr) + n;
2992 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2993 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2994 return -1;
2995 }
2996 }
2997
2998 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002999}
3000
3001PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003002 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003003 const char *errors)
3004{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003005 PyObject *repr = NULL;
3006 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003007
Martin v. Löwisd8251432006-06-14 05:21:04 +00003008#ifdef NEED_RETRY
3009 retry:
3010 if (size > INT_MAX)
3011 ret = encode_mbcs(&repr, p, INT_MAX);
3012 else
3013#endif
3014 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003015
Martin v. Löwisd8251432006-06-14 05:21:04 +00003016 if (ret < 0) {
3017 Py_XDECREF(repr);
3018 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003019 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003020
3021#ifdef NEED_RETRY
3022 if (size > INT_MAX) {
3023 p += INT_MAX;
3024 size -= INT_MAX;
3025 goto retry;
3026 }
3027#endif
3028
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003029 return repr;
3030}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003031
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003032PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3033{
3034 if (!PyUnicode_Check(unicode)) {
3035 PyErr_BadArgument();
3036 return NULL;
3037 }
3038 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3039 PyUnicode_GET_SIZE(unicode),
3040 NULL);
3041}
3042
Martin v. Löwisd8251432006-06-14 05:21:04 +00003043#undef NEED_RETRY
3044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003045#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003046
Guido van Rossumd57fd912000-03-10 22:53:23 +00003047/* --- Character Mapping Codec -------------------------------------------- */
3048
Guido van Rossumd57fd912000-03-10 22:53:23 +00003049PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003050 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051 PyObject *mapping,
3052 const char *errors)
3053{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003054 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003055 Py_ssize_t startinpos;
3056 Py_ssize_t endinpos;
3057 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003058 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003059 PyUnicodeObject *v;
3060 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003061 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003062 PyObject *errorHandler = NULL;
3063 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003064 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003065 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003066
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067 /* Default to Latin-1 */
3068 if (mapping == NULL)
3069 return PyUnicode_DecodeLatin1(s, size, errors);
3070
3071 v = _PyUnicode_New(size);
3072 if (v == NULL)
3073 goto onError;
3074 if (size == 0)
3075 return (PyObject *)v;
3076 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003077 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003078 if (PyUnicode_CheckExact(mapping)) {
3079 mapstring = PyUnicode_AS_UNICODE(mapping);
3080 maplen = PyUnicode_GET_SIZE(mapping);
3081 while (s < e) {
3082 unsigned char ch = *s;
3083 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003084
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003085 if (ch < maplen)
3086 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003088 if (x == 0xfffe) {
3089 /* undefined mapping */
3090 outpos = p-PyUnicode_AS_UNICODE(v);
3091 startinpos = s-starts;
3092 endinpos = startinpos+1;
3093 if (unicode_decode_call_errorhandler(
3094 errors, &errorHandler,
3095 "charmap", "character maps to <undefined>",
3096 starts, size, &startinpos, &endinpos, &exc, &s,
3097 (PyObject **)&v, &outpos, &p)) {
3098 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003099 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003100 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003101 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003102 *p++ = x;
3103 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003104 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003105 }
3106 else {
3107 while (s < e) {
3108 unsigned char ch = *s;
3109 PyObject *w, *x;
3110
3111 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3112 w = PyInt_FromLong((long)ch);
3113 if (w == NULL)
3114 goto onError;
3115 x = PyObject_GetItem(mapping, w);
3116 Py_DECREF(w);
3117 if (x == NULL) {
3118 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3119 /* No mapping found means: mapping is undefined. */
3120 PyErr_Clear();
3121 x = Py_None;
3122 Py_INCREF(x);
3123 } else
3124 goto onError;
3125 }
3126
3127 /* Apply mapping */
3128 if (PyInt_Check(x)) {
3129 long value = PyInt_AS_LONG(x);
3130 if (value < 0 || value > 65535) {
3131 PyErr_SetString(PyExc_TypeError,
3132 "character mapping must be in range(65536)");
3133 Py_DECREF(x);
3134 goto onError;
3135 }
3136 *p++ = (Py_UNICODE)value;
3137 }
3138 else if (x == Py_None) {
3139 /* undefined mapping */
3140 outpos = p-PyUnicode_AS_UNICODE(v);
3141 startinpos = s-starts;
3142 endinpos = startinpos+1;
3143 if (unicode_decode_call_errorhandler(
3144 errors, &errorHandler,
3145 "charmap", "character maps to <undefined>",
3146 starts, size, &startinpos, &endinpos, &exc, &s,
3147 (PyObject **)&v, &outpos, &p)) {
3148 Py_DECREF(x);
3149 goto onError;
3150 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003151 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003152 continue;
3153 }
3154 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003155 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003156
3157 if (targetsize == 1)
3158 /* 1-1 mapping */
3159 *p++ = *PyUnicode_AS_UNICODE(x);
3160
3161 else if (targetsize > 1) {
3162 /* 1-n mapping */
3163 if (targetsize > extrachars) {
3164 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003165 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3166 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003167 (targetsize << 2);
3168 extrachars += needed;
3169 if (_PyUnicode_Resize(&v,
3170 PyUnicode_GET_SIZE(v) + needed) < 0) {
3171 Py_DECREF(x);
3172 goto onError;
3173 }
3174 p = PyUnicode_AS_UNICODE(v) + oldpos;
3175 }
3176 Py_UNICODE_COPY(p,
3177 PyUnicode_AS_UNICODE(x),
3178 targetsize);
3179 p += targetsize;
3180 extrachars -= targetsize;
3181 }
3182 /* 1-0 mapping: skip the character */
3183 }
3184 else {
3185 /* wrong return value */
3186 PyErr_SetString(PyExc_TypeError,
3187 "character mapping must return integer, None or unicode");
3188 Py_DECREF(x);
3189 goto onError;
3190 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003191 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003192 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003193 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003194 }
3195 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003196 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003197 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003198 Py_XDECREF(errorHandler);
3199 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003200 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003201
Guido van Rossumd57fd912000-03-10 22:53:23 +00003202 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003203 Py_XDECREF(errorHandler);
3204 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003205 Py_XDECREF(v);
3206 return NULL;
3207}
3208
Martin v. Löwis3f767792006-06-04 19:36:28 +00003209/* Charmap encoding: the lookup table */
3210
3211struct encoding_map{
3212 PyObject_HEAD
3213 unsigned char level1[32];
3214 int count2, count3;
3215 unsigned char level23[1];
3216};
3217
3218static PyObject*
3219encoding_map_size(PyObject *obj, PyObject* args)
3220{
3221 struct encoding_map *map = (struct encoding_map*)obj;
3222 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3223 128*map->count3);
3224}
3225
3226static PyMethodDef encoding_map_methods[] = {
3227 {"size", encoding_map_size, METH_NOARGS,
3228 PyDoc_STR("Return the size (in bytes) of this object") },
3229 { 0 }
3230};
3231
/* tp_dealloc slot of EncodingMapType: the object owns no references
   to other objects, so releasing its memory block is all that is
   needed. */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
3237
/* Type object for the trie-based charmap encoding table built by
   PyUnicode_BuildEncodingMap().  Only deallocation and the method
   table (providing size()) are filled in; every other slot is left
   empty.  tp_itemsize is 0 even though instances are over-allocated:
   the creator computes the allocation size itself. */
static PyTypeObject EncodingMapType = {
    PyObject_HEAD_INIT(NULL)
    0,                          /*ob_size*/
    "EncodingMap",              /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                          /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,       /*tp_dealloc*/
    0,                          /*tp_print*/
    0,                          /*tp_getattr*/
    0,                          /*tp_setattr*/
    0,                          /*tp_compare*/
    0,                          /*tp_repr*/
    0,                          /*tp_as_number*/
    0,                          /*tp_as_sequence*/
    0,                          /*tp_as_mapping*/
    0,                          /*tp_hash*/
    0,                          /*tp_call*/
    0,                          /*tp_str*/
    0,                          /*tp_getattro*/
    0,                          /*tp_setattro*/
    0,                          /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,         /*tp_flags*/
    0,                          /*tp_doc*/
    0,                          /*tp_traverse*/
    0,                          /*tp_clear*/
    0,                          /*tp_richcompare*/
    0,                          /*tp_weaklistoffset*/
    0,                          /*tp_iter*/
    0,                          /*tp_iternext*/
    encoding_map_methods,       /*tp_methods*/
    0,                          /*tp_members*/
    0,                          /*tp_getset*/
    0,                          /*tp_base*/
    0,                          /*tp_dict*/
    0,                          /*tp_descr_get*/
    0,                          /*tp_descr_set*/
    0,                          /*tp_dictoffset*/
    0,                          /*tp_init*/
    0,                          /*tp_alloc*/
    0,                          /*tp_new*/
    0,                          /*tp_free*/
    0,                          /*tp_is_gc*/
};
3282
3283PyObject*
3284PyUnicode_BuildEncodingMap(PyObject* string)
3285{
3286 Py_UNICODE *decode;
3287 PyObject *result;
3288 struct encoding_map *mresult;
3289 int i;
3290 int need_dict = 0;
3291 unsigned char level1[32];
3292 unsigned char level2[512];
3293 unsigned char *mlevel1, *mlevel2, *mlevel3;
3294 int count2 = 0, count3 = 0;
3295
3296 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3297 PyErr_BadArgument();
3298 return NULL;
3299 }
3300 decode = PyUnicode_AS_UNICODE(string);
3301 memset(level1, 0xFF, sizeof level1);
3302 memset(level2, 0xFF, sizeof level2);
3303
3304 /* If there isn't a one-to-one mapping of NULL to \0,
3305 or if there are non-BMP characters, we need to use
3306 a mapping dictionary. */
3307 if (decode[0] != 0)
3308 need_dict = 1;
3309 for (i = 1; i < 256; i++) {
3310 int l1, l2;
3311 if (decode[i] == 0
3312 #ifdef Py_UNICODE_WIDE
3313 || decode[i] > 0xFFFF
3314 #endif
3315 ) {
3316 need_dict = 1;
3317 break;
3318 }
3319 if (decode[i] == 0xFFFE)
3320 /* unmapped character */
3321 continue;
3322 l1 = decode[i] >> 11;
3323 l2 = decode[i] >> 7;
3324 if (level1[l1] == 0xFF)
3325 level1[l1] = count2++;
3326 if (level2[l2] == 0xFF)
3327 level2[l2] = count3++;
3328 }
3329
3330 if (count2 >= 0xFF || count3 >= 0xFF)
3331 need_dict = 1;
3332
3333 if (need_dict) {
3334 PyObject *result = PyDict_New();
3335 PyObject *key, *value;
3336 if (!result)
3337 return NULL;
3338 for (i = 0; i < 256; i++) {
3339 key = value = NULL;
3340 key = PyInt_FromLong(decode[i]);
3341 value = PyInt_FromLong(i);
3342 if (!key || !value)
3343 goto failed1;
3344 if (PyDict_SetItem(result, key, value) == -1)
3345 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00003346 Py_DECREF(key);
3347 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003348 }
3349 return result;
3350 failed1:
3351 Py_XDECREF(key);
3352 Py_XDECREF(value);
3353 Py_DECREF(result);
3354 return NULL;
3355 }
3356
3357 /* Create a three-level trie */
3358 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3359 16*count2 + 128*count3 - 1);
3360 if (!result)
3361 return PyErr_NoMemory();
3362 PyObject_Init(result, &EncodingMapType);
3363 mresult = (struct encoding_map*)result;
3364 mresult->count2 = count2;
3365 mresult->count3 = count3;
3366 mlevel1 = mresult->level1;
3367 mlevel2 = mresult->level23;
3368 mlevel3 = mresult->level23 + 16*count2;
3369 memcpy(mlevel1, level1, 32);
3370 memset(mlevel2, 0xFF, 16*count2);
3371 memset(mlevel3, 0, 128*count3);
3372 count3 = 0;
3373 for (i = 1; i < 256; i++) {
3374 int o1, o2, o3, i2, i3;
3375 if (decode[i] == 0xFFFE)
3376 /* unmapped character */
3377 continue;
3378 o1 = decode[i]>>11;
3379 o2 = (decode[i]>>7) & 0xF;
3380 i2 = 16*mlevel1[o1] + o2;
3381 if (mlevel2[i2] == 0xFF)
3382 mlevel2[i2] = count3++;
3383 o3 = decode[i] & 0x7F;
3384 i3 = 128*mlevel2[i2] + o3;
3385 mlevel3[i3] = i;
3386 }
3387 return result;
3388}
3389
3390static int
3391encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3392{
3393 struct encoding_map *map = (struct encoding_map*)mapping;
3394 int l1 = c>>11;
3395 int l2 = (c>>7) & 0xF;
3396 int l3 = c & 0x7F;
3397 int i;
3398
3399#ifdef Py_UNICODE_WIDE
3400 if (c > 0xFFFF) {
3401 return -1;
3402 }
3403#endif
3404 if (c == 0)
3405 return 0;
3406 /* level 1*/
3407 i = map->level1[l1];
3408 if (i == 0xFF) {
3409 return -1;
3410 }
3411 /* level 2*/
3412 i = map->level23[16*i+l2];
3413 if (i == 0xFF) {
3414 return -1;
3415 }
3416 /* level 3 */
3417 i = map->level23[16*map->count2 + 128*i + l3];
3418 if (i == 0) {
3419 return -1;
3420 }
3421 return i;
3422}
3423
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003424/* Lookup the character ch in the mapping. If the character
3425 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003426 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003427static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003428{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003429 PyObject *w = PyInt_FromLong((long)c);
3430 PyObject *x;
3431
3432 if (w == NULL)
3433 return NULL;
3434 x = PyObject_GetItem(mapping, w);
3435 Py_DECREF(w);
3436 if (x == NULL) {
3437 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3438 /* No mapping found means: mapping is undefined. */
3439 PyErr_Clear();
3440 x = Py_None;
3441 Py_INCREF(x);
3442 return x;
3443 } else
3444 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003445 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003446 else if (x == Py_None)
3447 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003448 else if (PyInt_Check(x)) {
3449 long value = PyInt_AS_LONG(x);
3450 if (value < 0 || value > 255) {
3451 PyErr_SetString(PyExc_TypeError,
3452 "character mapping must be in range(256)");
3453 Py_DECREF(x);
3454 return NULL;
3455 }
3456 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003457 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003458 else if (PyString_Check(x))
3459 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003460 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003461 /* wrong return value */
3462 PyErr_SetString(PyExc_TypeError,
3463 "character mapping must return integer, None or str");
3464 Py_DECREF(x);
3465 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003466 }
3467}
3468
Martin v. Löwis3f767792006-06-04 19:36:28 +00003469static int
3470charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3471{
3472 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3473 /* exponentially overallocate to minimize reallocations */
3474 if (requiredsize < 2*outsize)
3475 requiredsize = 2*outsize;
3476 if (_PyString_Resize(outobj, requiredsize)) {
3477 return 0;
3478 }
3479 return 1;
3480}
3481
/* Outcome of charmapencode_output():
     enc_SUCCESS   - the character was encoded and written to the output,
     enc_FAILED    - the mapping is undefined for the character
                     (nothing was written),
     enc_EXCEPTION - the lookup or the output resize failed. */
typedef enum charmapencode_result {
    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
}charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003485/* lookup the character, put the result in the output string and adjust
3486 various state variables. Reallocate the output string if not enough
3487 space is available. Return a new reference to the object that
3488 was put in the output buffer, or Py_None, if the mapping was undefined
3489 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003490 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003491static
Martin v. Löwis3f767792006-06-04 19:36:28 +00003492charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003493 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003494{
Martin v. Löwis3f767792006-06-04 19:36:28 +00003495 PyObject *rep;
3496 char *outstart;
3497 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003498
Martin v. Löwis3f767792006-06-04 19:36:28 +00003499 if (mapping->ob_type == &EncodingMapType) {
3500 int res = encoding_map_lookup(c, mapping);
3501 Py_ssize_t requiredsize = *outpos+1;
3502 if (res == -1)
3503 return enc_FAILED;
3504 if (outsize<requiredsize)
3505 if (!charmapencode_resize(outobj, outpos, requiredsize))
3506 return enc_EXCEPTION;
3507 outstart = PyString_AS_STRING(*outobj);
3508 outstart[(*outpos)++] = (char)res;
3509 return enc_SUCCESS;
3510 }
3511
3512 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003513 if (rep==NULL)
Martin v. Löwis3f767792006-06-04 19:36:28 +00003514 return enc_EXCEPTION;
3515 else if (rep==Py_None) {
3516 Py_DECREF(rep);
3517 return enc_FAILED;
3518 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003519 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003520 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003521 if (outsize<requiredsize)
3522 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003523 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003524 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003525 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003526 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003527 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3528 }
3529 else {
3530 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003531 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3532 Py_ssize_t requiredsize = *outpos+repsize;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003533 if (outsize<requiredsize)
3534 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003535 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003536 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003537 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003538 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003539 memcpy(outstart + *outpos, repchars, repsize);
3540 *outpos += repsize;
3541 }
3542 }
Georg Brandl9f167602006-06-04 21:46:16 +00003543 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003544 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003545}
3546
3547/* handle an error in PyUnicode_EncodeCharmap
3548 Return 0 on success, -1 on error */
3549static
3550int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003551 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003552 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003553 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003554 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003555{
3556 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003557 Py_ssize_t repsize;
3558 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003559 Py_UNICODE *uni2;
3560 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003561 Py_ssize_t collstartpos = *inpos;
3562 Py_ssize_t collendpos = *inpos+1;
3563 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003564 char *encoding = "charmap";
3565 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00003566 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003567
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003568 /* find all unencodable characters */
3569 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00003570 PyObject *rep;
3571 if (mapping->ob_type == &EncodingMapType) {
3572 int res = encoding_map_lookup(p[collendpos], mapping);
3573 if (res != -1)
3574 break;
3575 ++collendpos;
3576 continue;
3577 }
3578
3579 rep = charmapencode_lookup(p[collendpos], mapping);
3580 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003581 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003582 else if (rep!=Py_None) {
3583 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003584 break;
3585 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003586 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003587 ++collendpos;
3588 }
3589 /* cache callback name lookup
3590 * (if not done yet, i.e. it's the first error) */
3591 if (*known_errorHandler==-1) {
3592 if ((errors==NULL) || (!strcmp(errors, "strict")))
3593 *known_errorHandler = 1;
3594 else if (!strcmp(errors, "replace"))
3595 *known_errorHandler = 2;
3596 else if (!strcmp(errors, "ignore"))
3597 *known_errorHandler = 3;
3598 else if (!strcmp(errors, "xmlcharrefreplace"))
3599 *known_errorHandler = 4;
3600 else
3601 *known_errorHandler = 0;
3602 }
3603 switch (*known_errorHandler) {
3604 case 1: /* strict */
3605 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3606 return -1;
3607 case 2: /* replace */
3608 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3609 x = charmapencode_output('?', mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003610 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003611 return -1;
3612 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003613 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003614 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3615 return -1;
3616 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003617 }
3618 /* fall through */
3619 case 3: /* ignore */
3620 *inpos = collendpos;
3621 break;
3622 case 4: /* xmlcharrefreplace */
3623 /* generate replacement (temporarily (mis)uses p) */
3624 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3625 char buffer[2+29+1+1];
3626 char *cp;
3627 sprintf(buffer, "&#%d;", (int)p[collpos]);
3628 for (cp = buffer; *cp; ++cp) {
3629 x = charmapencode_output(*cp, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003630 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003631 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003632 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003633 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3634 return -1;
3635 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003636 }
3637 }
3638 *inpos = collendpos;
3639 break;
3640 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003641 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003642 encoding, reason, p, size, exceptionObject,
3643 collstartpos, collendpos, &newpos);
3644 if (repunicode == NULL)
3645 return -1;
3646 /* generate replacement */
3647 repsize = PyUnicode_GET_SIZE(repunicode);
3648 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3649 x = charmapencode_output(*uni2, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003650 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003651 return -1;
3652 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003653 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003654 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003655 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3656 return -1;
3657 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003658 }
3659 *inpos = newpos;
3660 Py_DECREF(repunicode);
3661 }
3662 return 0;
3663}
3664
Guido van Rossumd57fd912000-03-10 22:53:23 +00003665PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003666 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003667 PyObject *mapping,
3668 const char *errors)
3669{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003670 /* output object */
3671 PyObject *res = NULL;
3672 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003673 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003674 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003675 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003676 PyObject *errorHandler = NULL;
3677 PyObject *exc = NULL;
3678 /* the following variable is used for caching string comparisons
3679 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3680 * 3=ignore, 4=xmlcharrefreplace */
3681 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003682
3683 /* Default to Latin-1 */
3684 if (mapping == NULL)
3685 return PyUnicode_EncodeLatin1(p, size, errors);
3686
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003687 /* allocate enough for a simple encoding without
3688 replacements, if we need more, we'll resize */
3689 res = PyString_FromStringAndSize(NULL, size);
3690 if (res == NULL)
3691 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003692 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003693 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003694
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003695 while (inpos<size) {
3696 /* try to encode it */
Martin v. Löwis3f767792006-06-04 19:36:28 +00003697 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3698 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003699 goto onError;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003700 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003701 if (charmap_encoding_error(p, size, &inpos, mapping,
3702 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003703 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003704 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003705 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003706 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003707 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003708 else
3709 /* done with this character => adjust input position */
3710 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003711 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003712
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003713 /* Resize if we allocated to much */
3714 if (respos<PyString_GET_SIZE(res)) {
3715 if (_PyString_Resize(&res, respos))
3716 goto onError;
3717 }
3718 Py_XDECREF(exc);
3719 Py_XDECREF(errorHandler);
3720 return res;
3721
3722 onError:
3723 Py_XDECREF(res);
3724 Py_XDECREF(exc);
3725 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003726 return NULL;
3727}
3728
3729PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3730 PyObject *mapping)
3731{
3732 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3733 PyErr_BadArgument();
3734 return NULL;
3735 }
3736 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3737 PyUnicode_GET_SIZE(unicode),
3738 mapping,
3739 NULL);
3740}
3741
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003742/* create or adjust a UnicodeTranslateError */
3743static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003744 const Py_UNICODE *unicode, Py_ssize_t size,
3745 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003746 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003747{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003748 if (*exceptionObject == NULL) {
3749 *exceptionObject = PyUnicodeTranslateError_Create(
3750 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003751 }
3752 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003753 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3754 goto onError;
3755 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3756 goto onError;
3757 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3758 goto onError;
3759 return;
3760 onError:
3761 Py_DECREF(*exceptionObject);
3762 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003763 }
3764}
3765
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003766/* raises a UnicodeTranslateError */
3767static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003768 const Py_UNICODE *unicode, Py_ssize_t size,
3769 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003770 const char *reason)
3771{
3772 make_translate_exception(exceptionObject,
3773 unicode, size, startpos, endpos, reason);
3774 if (*exceptionObject != NULL)
3775 PyCodec_StrictErrors(*exceptionObject);
3776}
3777
3778/* error handling callback helper:
3779 build arguments, call the callback and check the arguments,
3780 put the result into newpos and return the replacement string, which
3781 has to be freed by the caller */
3782static PyObject *unicode_translate_call_errorhandler(const char *errors,
3783 PyObject **errorHandler,
3784 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003785 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3786 Py_ssize_t startpos, Py_ssize_t endpos,
3787 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003788{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003789 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003790
Martin v. Löwis412fb672006-04-13 06:34:32 +00003791 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003792 PyObject *restuple;
3793 PyObject *resunicode;
3794
3795 if (*errorHandler == NULL) {
3796 *errorHandler = PyCodec_LookupError(errors);
3797 if (*errorHandler == NULL)
3798 return NULL;
3799 }
3800
3801 make_translate_exception(exceptionObject,
3802 unicode, size, startpos, endpos, reason);
3803 if (*exceptionObject == NULL)
3804 return NULL;
3805
3806 restuple = PyObject_CallFunctionObjArgs(
3807 *errorHandler, *exceptionObject, NULL);
3808 if (restuple == NULL)
3809 return NULL;
3810 if (!PyTuple_Check(restuple)) {
3811 PyErr_Format(PyExc_TypeError, &argparse[4]);
3812 Py_DECREF(restuple);
3813 return NULL;
3814 }
3815 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003816 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003817 Py_DECREF(restuple);
3818 return NULL;
3819 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003820 if (i_newpos<0)
3821 *newpos = size+i_newpos;
3822 else
3823 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003824 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003825 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003826 Py_DECREF(restuple);
3827 return NULL;
3828 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003829 Py_INCREF(resunicode);
3830 Py_DECREF(restuple);
3831 return resunicode;
3832}
3833
3834/* Lookup the character ch in the mapping and put the result in result,
3835 which must be decrefed by the caller.
3836 Return 0 on success, -1 on error */
3837static
3838int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3839{
3840 PyObject *w = PyInt_FromLong((long)c);
3841 PyObject *x;
3842
3843 if (w == NULL)
3844 return -1;
3845 x = PyObject_GetItem(mapping, w);
3846 Py_DECREF(w);
3847 if (x == NULL) {
3848 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3849 /* No mapping found means: use 1:1 mapping. */
3850 PyErr_Clear();
3851 *result = NULL;
3852 return 0;
3853 } else
3854 return -1;
3855 }
3856 else if (x == Py_None) {
3857 *result = x;
3858 return 0;
3859 }
3860 else if (PyInt_Check(x)) {
3861 long value = PyInt_AS_LONG(x);
3862 long max = PyUnicode_GetMax();
3863 if (value < 0 || value > max) {
3864 PyErr_Format(PyExc_TypeError,
3865 "character mapping must be in range(0x%lx)", max+1);
3866 Py_DECREF(x);
3867 return -1;
3868 }
3869 *result = x;
3870 return 0;
3871 }
3872 else if (PyUnicode_Check(x)) {
3873 *result = x;
3874 return 0;
3875 }
3876 else {
3877 /* wrong return value */
3878 PyErr_SetString(PyExc_TypeError,
3879 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003880 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003881 return -1;
3882 }
3883}
3884/* ensure that *outobj is at least requiredsize characters long,
3885if not reallocate and adjust various state variables.
3886Return 0 on success, -1 on error */
3887static
Walter Dörwald4894c302003-10-24 14:25:28 +00003888int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003889 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003890{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003891 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003892 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003893 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003894 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003895 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003896 if (requiredsize < 2 * oldsize)
3897 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003898 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003899 return -1;
3900 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003901 }
3902 return 0;
3903}
3904/* lookup the character, put the result in the output string and adjust
3905 various state variables. Return a new reference to the object that
3906 was put in the output buffer in *result, or Py_None, if the mapping was
3907 undefined (in which case no character was written).
3908 The called must decref result.
3909 Return 0 on success, -1 on error. */
3910static
Walter Dörwald4894c302003-10-24 14:25:28 +00003911int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003912 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003913 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003914{
Walter Dörwald4894c302003-10-24 14:25:28 +00003915 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003916 return -1;
3917 if (*res==NULL) {
3918 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003919 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003920 }
3921 else if (*res==Py_None)
3922 ;
3923 else if (PyInt_Check(*res)) {
3924 /* no overflow check, because we know that the space is enough */
3925 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3926 }
3927 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003928 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003929 if (repsize==1) {
3930 /* no overflow check, because we know that the space is enough */
3931 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3932 }
3933 else if (repsize!=0) {
3934 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003935 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003936 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003937 repsize - 1;
3938 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003939 return -1;
3940 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3941 *outp += repsize;
3942 }
3943 }
3944 else
3945 return -1;
3946 return 0;
3947}
3948
3949PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003950 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003951 PyObject *mapping,
3952 const char *errors)
3953{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003954 /* output object */
3955 PyObject *res = NULL;
3956 /* pointers to the beginning and end+1 of input */
3957 const Py_UNICODE *startp = p;
3958 const Py_UNICODE *endp = p + size;
3959 /* pointer into the output */
3960 Py_UNICODE *str;
3961 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003962 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003963 char *reason = "character maps to <undefined>";
3964 PyObject *errorHandler = NULL;
3965 PyObject *exc = NULL;
3966 /* the following variable is used for caching string comparisons
3967 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3968 * 3=ignore, 4=xmlcharrefreplace */
3969 int known_errorHandler = -1;
3970
Guido van Rossumd57fd912000-03-10 22:53:23 +00003971 if (mapping == NULL) {
3972 PyErr_BadArgument();
3973 return NULL;
3974 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003975
3976 /* allocate enough for a simple 1:1 translation without
3977 replacements, if we need more, we'll resize */
3978 res = PyUnicode_FromUnicode(NULL, size);
3979 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003980 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003981 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003982 return res;
3983 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003984
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003985 while (p<endp) {
3986 /* try to encode it */
3987 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003988 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003989 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003990 goto onError;
3991 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003992 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003993 if (x!=Py_None) /* it worked => adjust input pointer */
3994 ++p;
3995 else { /* untranslatable character */
3996 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003997 Py_ssize_t repsize;
3998 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003999 Py_UNICODE *uni2;
4000 /* startpos for collecting untranslatable chars */
4001 const Py_UNICODE *collstart = p;
4002 const Py_UNICODE *collend = p+1;
4003 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004005 /* find all untranslatable characters */
4006 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004007 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004008 goto onError;
4009 Py_XDECREF(x);
4010 if (x!=Py_None)
4011 break;
4012 ++collend;
4013 }
4014 /* cache callback name lookup
4015 * (if not done yet, i.e. it's the first error) */
4016 if (known_errorHandler==-1) {
4017 if ((errors==NULL) || (!strcmp(errors, "strict")))
4018 known_errorHandler = 1;
4019 else if (!strcmp(errors, "replace"))
4020 known_errorHandler = 2;
4021 else if (!strcmp(errors, "ignore"))
4022 known_errorHandler = 3;
4023 else if (!strcmp(errors, "xmlcharrefreplace"))
4024 known_errorHandler = 4;
4025 else
4026 known_errorHandler = 0;
4027 }
4028 switch (known_errorHandler) {
4029 case 1: /* strict */
4030 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4031 goto onError;
4032 case 2: /* replace */
4033 /* No need to check for space, this is a 1:1 replacement */
4034 for (coll = collstart; coll<collend; ++coll)
4035 *str++ = '?';
4036 /* fall through */
4037 case 3: /* ignore */
4038 p = collend;
4039 break;
4040 case 4: /* xmlcharrefreplace */
4041 /* generate replacement (temporarily (mis)uses p) */
4042 for (p = collstart; p < collend; ++p) {
4043 char buffer[2+29+1+1];
4044 char *cp;
4045 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004046 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004047 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4048 goto onError;
4049 for (cp = buffer; *cp; ++cp)
4050 *str++ = *cp;
4051 }
4052 p = collend;
4053 break;
4054 default:
4055 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4056 reason, startp, size, &exc,
4057 collstart-startp, collend-startp, &newpos);
4058 if (repunicode == NULL)
4059 goto onError;
4060 /* generate replacement */
4061 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004062 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004063 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4064 Py_DECREF(repunicode);
4065 goto onError;
4066 }
4067 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4068 *str++ = *uni2;
4069 p = startp + newpos;
4070 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004071 }
4072 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004073 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004074 /* Resize if we allocated to much */
4075 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004076 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004077 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004078 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004079 }
4080 Py_XDECREF(exc);
4081 Py_XDECREF(errorHandler);
4082 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004083
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004084 onError:
4085 Py_XDECREF(res);
4086 Py_XDECREF(exc);
4087 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004088 return NULL;
4089}
4090
4091PyObject *PyUnicode_Translate(PyObject *str,
4092 PyObject *mapping,
4093 const char *errors)
4094{
4095 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004096
Guido van Rossumd57fd912000-03-10 22:53:23 +00004097 str = PyUnicode_FromObject(str);
4098 if (str == NULL)
4099 goto onError;
4100 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4101 PyUnicode_GET_SIZE(str),
4102 mapping,
4103 errors);
4104 Py_DECREF(str);
4105 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004106
Guido van Rossumd57fd912000-03-10 22:53:23 +00004107 onError:
4108 Py_XDECREF(str);
4109 return NULL;
4110}
Tim Petersced69f82003-09-16 20:30:58 +00004111
Guido van Rossum9e896b32000-04-05 20:11:21 +00004112/* --- Decimal Encoder ---------------------------------------------------- */
4113
4114int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004115 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004116 char *output,
4117 const char *errors)
4118{
4119 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004120 PyObject *errorHandler = NULL;
4121 PyObject *exc = NULL;
4122 const char *encoding = "decimal";
4123 const char *reason = "invalid decimal Unicode string";
4124 /* the following variable is used for caching string comparisons
4125 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4126 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004127
4128 if (output == NULL) {
4129 PyErr_BadArgument();
4130 return -1;
4131 }
4132
4133 p = s;
4134 end = s + length;
4135 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004136 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004137 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004138 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004139 Py_ssize_t repsize;
4140 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004141 Py_UNICODE *uni2;
4142 Py_UNICODE *collstart;
4143 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004144
Guido van Rossum9e896b32000-04-05 20:11:21 +00004145 if (Py_UNICODE_ISSPACE(ch)) {
4146 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004147 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004148 continue;
4149 }
4150 decimal = Py_UNICODE_TODECIMAL(ch);
4151 if (decimal >= 0) {
4152 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004153 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004154 continue;
4155 }
Guido van Rossumba477042000-04-06 18:18:10 +00004156 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004157 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004158 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004159 continue;
4160 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004161 /* All other characters are considered unencodable */
4162 collstart = p;
4163 collend = p+1;
4164 while (collend < end) {
4165 if ((0 < *collend && *collend < 256) ||
4166 !Py_UNICODE_ISSPACE(*collend) ||
4167 Py_UNICODE_TODECIMAL(*collend))
4168 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004169 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004170 /* cache callback name lookup
4171 * (if not done yet, i.e. it's the first error) */
4172 if (known_errorHandler==-1) {
4173 if ((errors==NULL) || (!strcmp(errors, "strict")))
4174 known_errorHandler = 1;
4175 else if (!strcmp(errors, "replace"))
4176 known_errorHandler = 2;
4177 else if (!strcmp(errors, "ignore"))
4178 known_errorHandler = 3;
4179 else if (!strcmp(errors, "xmlcharrefreplace"))
4180 known_errorHandler = 4;
4181 else
4182 known_errorHandler = 0;
4183 }
4184 switch (known_errorHandler) {
4185 case 1: /* strict */
4186 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4187 goto onError;
4188 case 2: /* replace */
4189 for (p = collstart; p < collend; ++p)
4190 *output++ = '?';
4191 /* fall through */
4192 case 3: /* ignore */
4193 p = collend;
4194 break;
4195 case 4: /* xmlcharrefreplace */
4196 /* generate replacement (temporarily (mis)uses p) */
4197 for (p = collstart; p < collend; ++p)
4198 output += sprintf(output, "&#%d;", (int)*p);
4199 p = collend;
4200 break;
4201 default:
4202 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4203 encoding, reason, s, length, &exc,
4204 collstart-s, collend-s, &newpos);
4205 if (repunicode == NULL)
4206 goto onError;
4207 /* generate replacement */
4208 repsize = PyUnicode_GET_SIZE(repunicode);
4209 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4210 Py_UNICODE ch = *uni2;
4211 if (Py_UNICODE_ISSPACE(ch))
4212 *output++ = ' ';
4213 else {
4214 decimal = Py_UNICODE_TODECIMAL(ch);
4215 if (decimal >= 0)
4216 *output++ = '0' + decimal;
4217 else if (0 < ch && ch < 256)
4218 *output++ = (char)ch;
4219 else {
4220 Py_DECREF(repunicode);
4221 raise_encode_exception(&exc, encoding,
4222 s, length, collstart-s, collend-s, reason);
4223 goto onError;
4224 }
4225 }
4226 }
4227 p = s + newpos;
4228 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004229 }
4230 }
4231 /* 0-terminate the output string */
4232 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004233 Py_XDECREF(exc);
4234 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004235 return 0;
4236
4237 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004238 Py_XDECREF(exc);
4239 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004240 return -1;
4241}
4242
Guido van Rossumd57fd912000-03-10 22:53:23 +00004243/* --- Helpers ------------------------------------------------------------ */
4244
/* Bind the generic STRINGLIB_* names to the unicode type: the character
   type, and the macros/functions used to get an object's length, create
   a new object and get its character buffer.
   NOTE(review): presumably consumed by the stringlib headers included
   further down -- confirm against those headers. */
#define STRINGLIB_CHAR Py_UNICODE

#define STRINGLIB_LEN PyUnicode_GET_SIZE
#define STRINGLIB_NEW PyUnicode_FromUnicode
#define STRINGLIB_STR PyUnicode_AS_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00004250
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00004251Py_LOCAL_INLINE(int)
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004252STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
4253{
Fredrik Lundh9c0e9c02006-05-26 18:24:15 +00004254 if (str[0] != other[0])
4255 return 1;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004256 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
4257}
4258
/* Object the generic code should use for an empty result.
   NOTE(review): unicode_empty is defined outside this section --
   presumably the shared empty unicode object; confirm. */
#define STRINGLIB_EMPTY unicode_empty

/* Generic search helpers, included after the STRINGLIB_* names above
   have been defined; stringlib_count, stringlib_find_slice and
   stringlib_rfind_slice used below presumably come from these headers. */
#include "stringlib/fastsearch.h"

#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/partition.h"
4266
/* Helper macro that normalizes slice bounds in place.  It operates on
   the variables named start and end in the calling scope and reads
   (obj)->length several times:
     - a negative start/end counts from the end of the string and is
       clamped to 0 if it is still negative afterwards;
     - an end beyond the string is clamped to the string length.
   start is NOT clamped to the length.  The macro expands to a sequence
   of statements, so it must be used where a statement list is valid. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
4279
Martin v. Löwis18e16552006-02-15 17:27:45 +00004280Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004281 PyObject *substr,
4282 Py_ssize_t start,
4283 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004284{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004285 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004286 PyUnicodeObject* str_obj;
4287 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004288
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004289 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4290 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004291 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004292 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4293 if (!sub_obj) {
4294 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004295 return -1;
4296 }
Tim Petersced69f82003-09-16 20:30:58 +00004297
Fredrik Lundhc8162812006-05-26 19:33:03 +00004298 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004299
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004300 result = stringlib_count(
4301 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4302 );
4303
4304 Py_DECREF(sub_obj);
4305 Py_DECREF(str_obj);
4306
Guido van Rossumd57fd912000-03-10 22:53:23 +00004307 return result;
4308}
4309
Martin v. Löwis18e16552006-02-15 17:27:45 +00004310Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004311 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004312 Py_ssize_t start,
4313 Py_ssize_t end,
4314 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004315{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004316 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004317
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004318 str = PyUnicode_FromObject(str);
4319 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004320 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004321 sub = PyUnicode_FromObject(sub);
4322 if (!sub) {
4323 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004324 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004325 }
Tim Petersced69f82003-09-16 20:30:58 +00004326
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004327 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004328 result = stringlib_find_slice(
4329 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4330 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4331 start, end
4332 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004333 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004334 result = stringlib_rfind_slice(
4335 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4336 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4337 start, end
4338 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004339
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004340 Py_DECREF(str);
4341 Py_DECREF(sub);
4342
Guido van Rossumd57fd912000-03-10 22:53:23 +00004343 return result;
4344}
4345
Tim Petersced69f82003-09-16 20:30:58 +00004346static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004347int tailmatch(PyUnicodeObject *self,
4348 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004349 Py_ssize_t start,
4350 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004351 int direction)
4352{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004353 if (substring->length == 0)
4354 return 1;
4355
Fredrik Lundhc8162812006-05-26 19:33:03 +00004356 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004357
4358 end -= substring->length;
4359 if (end < start)
4360 return 0;
4361
4362 if (direction > 0) {
4363 if (Py_UNICODE_MATCH(self, end, substring))
4364 return 1;
4365 } else {
4366 if (Py_UNICODE_MATCH(self, start, substring))
4367 return 1;
4368 }
4369
4370 return 0;
4371}
4372
Martin v. Löwis18e16552006-02-15 17:27:45 +00004373Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004374 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004375 Py_ssize_t start,
4376 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004377 int direction)
4378{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004379 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004380
Guido van Rossumd57fd912000-03-10 22:53:23 +00004381 str = PyUnicode_FromObject(str);
4382 if (str == NULL)
4383 return -1;
4384 substr = PyUnicode_FromObject(substr);
4385 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004386 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004387 return -1;
4388 }
Tim Petersced69f82003-09-16 20:30:58 +00004389
Guido van Rossumd57fd912000-03-10 22:53:23 +00004390 result = tailmatch((PyUnicodeObject *)str,
4391 (PyUnicodeObject *)substr,
4392 start, end, direction);
4393 Py_DECREF(str);
4394 Py_DECREF(substr);
4395 return result;
4396}
4397
Guido van Rossumd57fd912000-03-10 22:53:23 +00004398/* Apply fixfct filter to the Unicode object self and return a
4399 reference to the modified object */
4400
Tim Petersced69f82003-09-16 20:30:58 +00004401static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004402PyObject *fixup(PyUnicodeObject *self,
4403 int (*fixfct)(PyUnicodeObject *s))
4404{
4405
4406 PyUnicodeObject *u;
4407
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004408 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004409 if (u == NULL)
4410 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004411
4412 Py_UNICODE_COPY(u->str, self->str, self->length);
4413
Tim Peters7a29bd52001-09-12 03:03:31 +00004414 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004415 /* fixfct should return TRUE if it modified the buffer. If
4416 FALSE, return a reference to the original buffer instead
4417 (to save space, not time) */
4418 Py_INCREF(self);
4419 Py_DECREF(u);
4420 return (PyObject*) self;
4421 }
4422 return (PyObject*) u;
4423}
4424
Tim Petersced69f82003-09-16 20:30:58 +00004425static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004426int fixupper(PyUnicodeObject *self)
4427{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004428 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004429 Py_UNICODE *s = self->str;
4430 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004431
Guido van Rossumd57fd912000-03-10 22:53:23 +00004432 while (len-- > 0) {
4433 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004434
Guido van Rossumd57fd912000-03-10 22:53:23 +00004435 ch = Py_UNICODE_TOUPPER(*s);
4436 if (ch != *s) {
4437 status = 1;
4438 *s = ch;
4439 }
4440 s++;
4441 }
4442
4443 return status;
4444}
4445
Tim Petersced69f82003-09-16 20:30:58 +00004446static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004447int fixlower(PyUnicodeObject *self)
4448{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004449 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450 Py_UNICODE *s = self->str;
4451 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004452
Guido van Rossumd57fd912000-03-10 22:53:23 +00004453 while (len-- > 0) {
4454 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004455
Guido van Rossumd57fd912000-03-10 22:53:23 +00004456 ch = Py_UNICODE_TOLOWER(*s);
4457 if (ch != *s) {
4458 status = 1;
4459 *s = ch;
4460 }
4461 s++;
4462 }
4463
4464 return status;
4465}
4466
Tim Petersced69f82003-09-16 20:30:58 +00004467static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468int fixswapcase(PyUnicodeObject *self)
4469{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004470 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471 Py_UNICODE *s = self->str;
4472 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004473
Guido van Rossumd57fd912000-03-10 22:53:23 +00004474 while (len-- > 0) {
4475 if (Py_UNICODE_ISUPPER(*s)) {
4476 *s = Py_UNICODE_TOLOWER(*s);
4477 status = 1;
4478 } else if (Py_UNICODE_ISLOWER(*s)) {
4479 *s = Py_UNICODE_TOUPPER(*s);
4480 status = 1;
4481 }
4482 s++;
4483 }
4484
4485 return status;
4486}
4487
Tim Petersced69f82003-09-16 20:30:58 +00004488static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489int fixcapitalize(PyUnicodeObject *self)
4490{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004491 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004492 Py_UNICODE *s = self->str;
4493 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004494
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004495 if (len == 0)
4496 return 0;
4497 if (Py_UNICODE_ISLOWER(*s)) {
4498 *s = Py_UNICODE_TOUPPER(*s);
4499 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004500 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004501 s++;
4502 while (--len > 0) {
4503 if (Py_UNICODE_ISUPPER(*s)) {
4504 *s = Py_UNICODE_TOLOWER(*s);
4505 status = 1;
4506 }
4507 s++;
4508 }
4509 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004510}
4511
4512static
4513int fixtitle(PyUnicodeObject *self)
4514{
4515 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4516 register Py_UNICODE *e;
4517 int previous_is_cased;
4518
4519 /* Shortcut for single character strings */
4520 if (PyUnicode_GET_SIZE(self) == 1) {
4521 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4522 if (*p != ch) {
4523 *p = ch;
4524 return 1;
4525 }
4526 else
4527 return 0;
4528 }
Tim Petersced69f82003-09-16 20:30:58 +00004529
Guido van Rossumd57fd912000-03-10 22:53:23 +00004530 e = p + PyUnicode_GET_SIZE(self);
4531 previous_is_cased = 0;
4532 for (; p < e; p++) {
4533 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004534
Guido van Rossumd57fd912000-03-10 22:53:23 +00004535 if (previous_is_cased)
4536 *p = Py_UNICODE_TOLOWER(ch);
4537 else
4538 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004539
4540 if (Py_UNICODE_ISLOWER(ch) ||
4541 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004542 Py_UNICODE_ISTITLE(ch))
4543 previous_is_cased = 1;
4544 else
4545 previous_is_cased = 0;
4546 }
4547 return 1;
4548}
4549
Tim Peters8ce9f162004-08-27 01:49:32 +00004550PyObject *
4551PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004552{
Tim Peters8ce9f162004-08-27 01:49:32 +00004553 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004554 const Py_UNICODE blank = ' ';
4555 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00004556 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004557 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00004558 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4559 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004560 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4561 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004562 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004563 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004564 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004565
Tim Peters05eba1f2004-08-27 21:32:02 +00004566 fseq = PySequence_Fast(seq, "");
4567 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004568 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004569 }
4570
Tim Peters91879ab2004-08-27 22:35:44 +00004571 /* Grrrr. A codec may be invoked to convert str objects to
4572 * Unicode, and so it's possible to call back into Python code
4573 * during PyUnicode_FromObject(), and so it's possible for a sick
4574 * codec to change the size of fseq (if seq is a list). Therefore
4575 * we have to keep refetching the size -- can't assume seqlen
4576 * is invariant.
4577 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004578 seqlen = PySequence_Fast_GET_SIZE(fseq);
4579 /* If empty sequence, return u"". */
4580 if (seqlen == 0) {
4581 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4582 goto Done;
4583 }
4584 /* If singleton sequence with an exact Unicode, return that. */
4585 if (seqlen == 1) {
4586 item = PySequence_Fast_GET_ITEM(fseq, 0);
4587 if (PyUnicode_CheckExact(item)) {
4588 Py_INCREF(item);
4589 res = (PyUnicodeObject *)item;
4590 goto Done;
4591 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004592 }
4593
Tim Peters05eba1f2004-08-27 21:32:02 +00004594 /* At least two items to join, or one that isn't exact Unicode. */
4595 if (seqlen > 1) {
4596 /* Set up sep and seplen -- they're needed. */
4597 if (separator == NULL) {
4598 sep = &blank;
4599 seplen = 1;
4600 }
4601 else {
4602 internal_separator = PyUnicode_FromObject(separator);
4603 if (internal_separator == NULL)
4604 goto onError;
4605 sep = PyUnicode_AS_UNICODE(internal_separator);
4606 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004607 /* In case PyUnicode_FromObject() mutated seq. */
4608 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004609 }
4610 }
4611
4612 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004613 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004614 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004615 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004616 res_p = PyUnicode_AS_UNICODE(res);
4617 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004618
Tim Peters05eba1f2004-08-27 21:32:02 +00004619 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00004620 Py_ssize_t itemlen;
4621 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004622
4623 item = PySequence_Fast_GET_ITEM(fseq, i);
4624 /* Convert item to Unicode. */
4625 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4626 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00004627 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004628 " %.80s found",
4629 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004630 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004631 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004632 item = PyUnicode_FromObject(item);
4633 if (item == NULL)
4634 goto onError;
4635 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004636
Tim Peters91879ab2004-08-27 22:35:44 +00004637 /* In case PyUnicode_FromObject() mutated seq. */
4638 seqlen = PySequence_Fast_GET_SIZE(fseq);
4639
Tim Peters8ce9f162004-08-27 01:49:32 +00004640 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004641 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004642 new_res_used = res_used + itemlen;
Georg Brandl90e27d32006-06-10 06:40:50 +00004643 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004644 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004645 if (i < seqlen - 1) {
4646 new_res_used += seplen;
Georg Brandl90e27d32006-06-10 06:40:50 +00004647 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004648 goto Overflow;
4649 }
4650 if (new_res_used > res_alloc) {
4651 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004652 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004653 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00004654 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004655 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004656 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00004657 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004658 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004659 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004660 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004661 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004662 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004663
4664 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004665 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004666 res_p += itemlen;
4667 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004668 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004669 res_p += seplen;
4670 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004671 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004672 res_used = new_res_used;
4673 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004674
Tim Peters05eba1f2004-08-27 21:32:02 +00004675 /* Shrink res to match the used area; this probably can't fail,
4676 * but it's cheap to check.
4677 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004678 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004679 goto onError;
4680
4681 Done:
4682 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004683 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004684 return (PyObject *)res;
4685
Tim Peters8ce9f162004-08-27 01:49:32 +00004686 Overflow:
4687 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00004688 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00004689 Py_DECREF(item);
4690 /* fall through */
4691
Guido van Rossumd57fd912000-03-10 22:53:23 +00004692 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004693 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004694 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004695 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004696 return NULL;
4697}
4698
Tim Petersced69f82003-09-16 20:30:58 +00004699static
4700PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004701 Py_ssize_t left,
4702 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004703 Py_UNICODE fill)
4704{
4705 PyUnicodeObject *u;
4706
4707 if (left < 0)
4708 left = 0;
4709 if (right < 0)
4710 right = 0;
4711
Tim Peters7a29bd52001-09-12 03:03:31 +00004712 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004713 Py_INCREF(self);
4714 return self;
4715 }
4716
4717 u = _PyUnicode_New(left + self->length + right);
4718 if (u) {
4719 if (left)
4720 Py_UNICODE_FILL(u->str, fill, left);
4721 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4722 if (right)
4723 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4724 }
4725
4726 return u;
4727}
4728
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The expanding function must declare `PyObject *str`, have `list` in
   scope and provide an `onError` label; control jumps there if the slice
   cannot be created or appended.  The temporary reference stored in `str`
   is released on every path.

   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement and stays correct inside an unbraced if/else; call it
   with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4739
4740static
4741PyObject *split_whitespace(PyUnicodeObject *self,
4742 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004743 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004745 register Py_ssize_t i;
4746 register Py_ssize_t j;
4747 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748 PyObject *str;
4749
4750 for (i = j = 0; i < len; ) {
4751 /* find a token */
4752 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4753 i++;
4754 j = i;
4755 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4756 i++;
4757 if (j < i) {
4758 if (maxcount-- <= 0)
4759 break;
4760 SPLIT_APPEND(self->str, j, i);
4761 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4762 i++;
4763 j = i;
4764 }
4765 }
4766 if (j < len) {
4767 SPLIT_APPEND(self->str, j, len);
4768 }
4769 return list;
4770
4771 onError:
4772 Py_DECREF(list);
4773 return NULL;
4774}
4775
4776PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004777 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004778{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004779 register Py_ssize_t i;
4780 register Py_ssize_t j;
4781 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004782 PyObject *list;
4783 PyObject *str;
4784 Py_UNICODE *data;
4785
4786 string = PyUnicode_FromObject(string);
4787 if (string == NULL)
4788 return NULL;
4789 data = PyUnicode_AS_UNICODE(string);
4790 len = PyUnicode_GET_SIZE(string);
4791
Guido van Rossumd57fd912000-03-10 22:53:23 +00004792 list = PyList_New(0);
4793 if (!list)
4794 goto onError;
4795
4796 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004797 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004798
Guido van Rossumd57fd912000-03-10 22:53:23 +00004799 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004800 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004801 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004802
4803 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004804 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004805 if (i < len) {
4806 if (data[i] == '\r' && i + 1 < len &&
4807 data[i+1] == '\n')
4808 i += 2;
4809 else
4810 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004811 if (keepends)
4812 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004813 }
Guido van Rossum86662912000-04-11 15:38:46 +00004814 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004815 j = i;
4816 }
4817 if (j < len) {
4818 SPLIT_APPEND(data, j, len);
4819 }
4820
4821 Py_DECREF(string);
4822 return list;
4823
4824 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004825 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004826 Py_DECREF(string);
4827 return NULL;
4828}
4829
Tim Petersced69f82003-09-16 20:30:58 +00004830static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004831PyObject *split_char(PyUnicodeObject *self,
4832 PyObject *list,
4833 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004834 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004835{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004836 register Py_ssize_t i;
4837 register Py_ssize_t j;
4838 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004839 PyObject *str;
4840
4841 for (i = j = 0; i < len; ) {
4842 if (self->str[i] == ch) {
4843 if (maxcount-- <= 0)
4844 break;
4845 SPLIT_APPEND(self->str, j, i);
4846 i = j = i + 1;
4847 } else
4848 i++;
4849 }
4850 if (j <= len) {
4851 SPLIT_APPEND(self->str, j, len);
4852 }
4853 return list;
4854
4855 onError:
4856 Py_DECREF(list);
4857 return NULL;
4858}
4859
Tim Petersced69f82003-09-16 20:30:58 +00004860static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004861PyObject *split_substring(PyUnicodeObject *self,
4862 PyObject *list,
4863 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004864 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004866 register Py_ssize_t i;
4867 register Py_ssize_t j;
4868 Py_ssize_t len = self->length;
4869 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870 PyObject *str;
4871
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004872 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004873 if (Py_UNICODE_MATCH(self, i, substring)) {
4874 if (maxcount-- <= 0)
4875 break;
4876 SPLIT_APPEND(self->str, j, i);
4877 i = j = i + sublen;
4878 } else
4879 i++;
4880 }
4881 if (j <= len) {
4882 SPLIT_APPEND(self->str, j, len);
4883 }
4884 return list;
4885
4886 onError:
4887 Py_DECREF(list);
4888 return NULL;
4889}
4890
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004891static
4892PyObject *rsplit_whitespace(PyUnicodeObject *self,
4893 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004894 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004895{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004896 register Py_ssize_t i;
4897 register Py_ssize_t j;
4898 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004899 PyObject *str;
4900
4901 for (i = j = len - 1; i >= 0; ) {
4902 /* find a token */
4903 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4904 i--;
4905 j = i;
4906 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4907 i--;
4908 if (j > i) {
4909 if (maxcount-- <= 0)
4910 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004911 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004912 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4913 i--;
4914 j = i;
4915 }
4916 }
4917 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004918 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004919 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004920 if (PyList_Reverse(list) < 0)
4921 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004922 return list;
4923
4924 onError:
4925 Py_DECREF(list);
4926 return NULL;
4927}
4928
4929static
4930PyObject *rsplit_char(PyUnicodeObject *self,
4931 PyObject *list,
4932 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004933 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004934{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004935 register Py_ssize_t i;
4936 register Py_ssize_t j;
4937 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004938 PyObject *str;
4939
4940 for (i = j = len - 1; i >= 0; ) {
4941 if (self->str[i] == ch) {
4942 if (maxcount-- <= 0)
4943 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004944 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004945 j = i = i - 1;
4946 } else
4947 i--;
4948 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004949 if (j >= -1) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004950 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004951 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004952 if (PyList_Reverse(list) < 0)
4953 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004954 return list;
4955
4956 onError:
4957 Py_DECREF(list);
4958 return NULL;
4959}
4960
4961static
4962PyObject *rsplit_substring(PyUnicodeObject *self,
4963 PyObject *list,
4964 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004965 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004966{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004967 register Py_ssize_t i;
4968 register Py_ssize_t j;
4969 Py_ssize_t len = self->length;
4970 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004971 PyObject *str;
4972
4973 for (i = len - sublen, j = len; i >= 0; ) {
4974 if (Py_UNICODE_MATCH(self, i, substring)) {
4975 if (maxcount-- <= 0)
4976 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004977 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004978 j = i;
4979 i -= sublen;
4980 } else
4981 i--;
4982 }
4983 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004984 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004985 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004986 if (PyList_Reverse(list) < 0)
4987 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004988 return list;
4989
4990 onError:
4991 Py_DECREF(list);
4992 return NULL;
4993}
4994
Guido van Rossumd57fd912000-03-10 22:53:23 +00004995#undef SPLIT_APPEND
4996
4997static
4998PyObject *split(PyUnicodeObject *self,
4999 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005000 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005001{
5002 PyObject *list;
5003
5004 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005005 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005006
5007 list = PyList_New(0);
5008 if (!list)
5009 return NULL;
5010
5011 if (substring == NULL)
5012 return split_whitespace(self,list,maxcount);
5013
5014 else if (substring->length == 1)
5015 return split_char(self,list,substring->str[0],maxcount);
5016
5017 else if (substring->length == 0) {
5018 Py_DECREF(list);
5019 PyErr_SetString(PyExc_ValueError, "empty separator");
5020 return NULL;
5021 }
5022 else
5023 return split_substring(self,list,substring,maxcount);
5024}
5025
Tim Petersced69f82003-09-16 20:30:58 +00005026static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005027PyObject *rsplit(PyUnicodeObject *self,
5028 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005029 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005030{
5031 PyObject *list;
5032
5033 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005034 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005035
5036 list = PyList_New(0);
5037 if (!list)
5038 return NULL;
5039
5040 if (substring == NULL)
5041 return rsplit_whitespace(self,list,maxcount);
5042
5043 else if (substring->length == 1)
5044 return rsplit_char(self,list,substring->str[0],maxcount);
5045
5046 else if (substring->length == 0) {
5047 Py_DECREF(list);
5048 PyErr_SetString(PyExc_ValueError, "empty separator");
5049 return NULL;
5050 }
5051 else
5052 return rsplit_substring(self,list,substring,maxcount);
5053}
5054
5055static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005056PyObject *replace(PyUnicodeObject *self,
5057 PyUnicodeObject *str1,
5058 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005059 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005060{
5061 PyUnicodeObject *u;
5062
5063 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005064 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005065
Fredrik Lundh347ee272006-05-24 16:35:18 +00005066 if (str1->length == str2->length) {
5067 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005068 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005069 if (str1->length == 1) {
5070 /* replace characters */
5071 Py_UNICODE u1, u2;
5072 if (!findchar(self->str, self->length, str1->str[0]))
5073 goto nothing;
5074 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5075 if (!u)
5076 return NULL;
5077 Py_UNICODE_COPY(u->str, self->str, self->length);
5078 u1 = str1->str[0];
5079 u2 = str2->str[0];
5080 for (i = 0; i < u->length; i++)
5081 if (u->str[i] == u1) {
5082 if (--maxcount < 0)
5083 break;
5084 u->str[i] = u2;
5085 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005086 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005087 i = fastsearch(
5088 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005090 if (i < 0)
5091 goto nothing;
5092 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5093 if (!u)
5094 return NULL;
5095 Py_UNICODE_COPY(u->str, self->str, self->length);
5096 while (i <= self->length - str1->length)
5097 if (Py_UNICODE_MATCH(self, i, str1)) {
5098 if (--maxcount < 0)
5099 break;
5100 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5101 i += str1->length;
5102 } else
5103 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005104 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005105 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005106
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005107 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005108 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005109 Py_UNICODE *p;
5110
5111 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005112 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005113 if (n > maxcount)
5114 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005115 if (n == 0)
5116 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005117 /* new_size = self->length + n * (str2->length - str1->length)); */
5118 delta = (str2->length - str1->length);
5119 if (delta == 0) {
5120 new_size = self->length;
5121 } else {
5122 product = n * (str2->length - str1->length);
5123 if ((product / (str2->length - str1->length)) != n) {
5124 PyErr_SetString(PyExc_OverflowError,
5125 "replace string is too long");
5126 return NULL;
5127 }
5128 new_size = self->length + product;
5129 if (new_size < 0) {
5130 PyErr_SetString(PyExc_OverflowError,
5131 "replace string is too long");
5132 return NULL;
5133 }
5134 }
5135 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005136 if (!u)
5137 return NULL;
5138 i = 0;
5139 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005140 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005141 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005142 while (n-- > 0) {
5143 /* look for next match */
5144 j = i;
5145 while (j <= e) {
5146 if (Py_UNICODE_MATCH(self, j, str1))
5147 break;
5148 j++;
5149 }
5150 if (j > i) {
5151 if (j > e)
5152 break;
5153 /* copy unchanged part [i:j] */
5154 Py_UNICODE_COPY(p, self->str+i, j-i);
5155 p += j - i;
5156 }
5157 /* copy substitution string */
5158 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005159 Py_UNICODE_COPY(p, str2->str, str2->length);
5160 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005161 }
5162 i = j + str1->length;
5163 }
5164 if (i < self->length)
5165 /* copy tail [i:] */
5166 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005167 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005168 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005169 while (n > 0) {
5170 Py_UNICODE_COPY(p, str2->str, str2->length);
5171 p += str2->length;
5172 if (--n <= 0)
5173 break;
5174 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005176 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005177 }
5178 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005180
5181nothing:
5182 /* nothing to replace; return original string (when possible) */
5183 if (PyUnicode_CheckExact(self)) {
5184 Py_INCREF(self);
5185 return (PyObject *) self;
5186 }
5187 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188}
5189
5190/* --- Unicode Object Methods --------------------------------------------- */
5191
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005192PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193"S.title() -> unicode\n\
5194\n\
5195Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005196characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197
5198static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005199unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005201 return fixup(self, fixtitle);
5202}
5203
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005204PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205"S.capitalize() -> unicode\n\
5206\n\
5207Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005208have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209
/* Implementation of S.capitalize(): returns whatever fixup() produces when
   applied to self with the fixcapitalize transformation. */
static PyObject*
unicode_capitalize(PyUnicodeObject *self)
{
    return fixup(self, fixcapitalize);
}
5215
5216#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005217PyDoc_STRVAR(capwords__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218"S.capwords() -> unicode\n\
5219\n\
5220Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005221normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222
5223static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005224unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005225{
5226 PyObject *list;
5227 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005228 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230 /* Split into words */
5231 list = split(self, NULL, -1);
5232 if (!list)
5233 return NULL;
5234
5235 /* Capitalize each word */
5236 for (i = 0; i < PyList_GET_SIZE(list); i++) {
5237 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
5238 fixcapitalize);
5239 if (item == NULL)
5240 goto onError;
5241 Py_DECREF(PyList_GET_ITEM(list, i));
5242 PyList_SET_ITEM(list, i, item);
5243 }
5244
5245 /* Join the words to form a new string */
5246 item = PyUnicode_Join(NULL, list);
5247
5248onError:
5249 Py_DECREF(list);
5250 return (PyObject *)item;
5251}
5252#endif
5253
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005254/* Argument converter. Coerces to a single unicode character */
5255
5256static int
5257convert_uc(PyObject *obj, void *addr)
5258{
5259 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5260 PyObject *uniobj;
5261 Py_UNICODE *unistr;
5262
5263 uniobj = PyUnicode_FromObject(obj);
5264 if (uniobj == NULL) {
5265 PyErr_SetString(PyExc_TypeError,
5266 "The fill character cannot be converted to Unicode");
5267 return 0;
5268 }
5269 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5270 PyErr_SetString(PyExc_TypeError,
5271 "The fill character must be exactly one character long");
5272 Py_DECREF(uniobj);
5273 return 0;
5274 }
5275 unistr = PyUnicode_AS_UNICODE(uniobj);
5276 *fillcharloc = unistr[0];
5277 Py_DECREF(uniobj);
5278 return 1;
5279}
5280
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005281PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005282"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005283\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005284Return S centered in a Unicode string of length width. Padding is\n\
5285done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286
5287static PyObject *
5288unicode_center(PyUnicodeObject *self, PyObject *args)
5289{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005290 Py_ssize_t marg, left;
5291 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005292 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005293
Thomas Woutersde017742006-02-16 19:34:37 +00005294 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295 return NULL;
5296
Tim Peters7a29bd52001-09-12 03:03:31 +00005297 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005298 Py_INCREF(self);
5299 return (PyObject*) self;
5300 }
5301
5302 marg = width - self->length;
5303 left = marg / 2 + (marg & width & 1);
5304
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005305 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005306}
5307
Marc-André Lemburge5034372000-08-08 08:04:29 +00005308#if 0
5309
5310/* This code should go into some future Unicode collation support
5311 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005312 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005313
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005314/* speedy UTF-16 code point order comparison */
5315/* gleaned from: */
5316/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5317
/* Per-range correction added to a 16-bit code unit before comparison.
   The table is indexed by the top five bits of the unit (c >> 11):
   entry 27 (units 0xD800-0xDFFF) adds 0x2000, entries 28-31 (units
   0xE000-0xFFFF) subtract 0x800, and every other entry leaves the unit
   unchanged. */
static short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};

/* Three-way comparison of two Unicode objects (variant currently compiled
   out).  Units are compared pairwise after applying the utf16Fixup
   correction; if one string is a prefix of the other, the shorter one
   orders first.  Returns -1, 0 or 1. */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    Py_ssize_t len1, len2;

    Py_UNICODE *s1 = str1->str;
    Py_UNICODE *s2 = str2->str;

    len1 = str1->length;
    len2 = str2->length;

    while (len1 > 0 && len2 > 0) {
        Py_UNICODE c1, c2;

        c1 = *s1++;
        c2 = *s2++;

        /* only units above 0xD000 (26 << 11) are looked up in the table */
        if (c1 > (1<<11) * 26)
            c1 += utf16Fixup[c1>>11];
        if (c2 > (1<<11) * 26)
            c2 += utf16Fixup[c2>>11];
        /* now c1 and c2 are in UTF-32-compatible order */

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;

        len1--; len2--;
    }

    /* common prefix exhausted: the shorter string orders first */
    return (len1 < len2) ? -1 : (len1 != len2);
}
5357
Marc-André Lemburge5034372000-08-08 08:04:29 +00005358#else
5359
5360static int
5361unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5362{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005363 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005364
5365 Py_UNICODE *s1 = str1->str;
5366 Py_UNICODE *s2 = str2->str;
5367
5368 len1 = str1->length;
5369 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005370
Marc-André Lemburge5034372000-08-08 08:04:29 +00005371 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005372 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005373
Fredrik Lundh45714e92001-06-26 16:39:36 +00005374 c1 = *s1++;
5375 c2 = *s2++;
5376
5377 if (c1 != c2)
5378 return (c1 < c2) ? -1 : 1;
5379
Marc-André Lemburge5034372000-08-08 08:04:29 +00005380 len1--; len2--;
5381 }
5382
5383 return (len1 < len2) ? -1 : (len1 != len2);
5384}
5385
5386#endif
5387
Guido van Rossumd57fd912000-03-10 22:53:23 +00005388int PyUnicode_Compare(PyObject *left,
5389 PyObject *right)
5390{
5391 PyUnicodeObject *u = NULL, *v = NULL;
5392 int result;
5393
5394 /* Coerce the two arguments */
5395 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5396 if (u == NULL)
5397 goto onError;
5398 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5399 if (v == NULL)
5400 goto onError;
5401
Thomas Wouters7e474022000-07-16 12:04:32 +00005402 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403 if (v == u) {
5404 Py_DECREF(u);
5405 Py_DECREF(v);
5406 return 0;
5407 }
5408
5409 result = unicode_compare(u, v);
5410
5411 Py_DECREF(u);
5412 Py_DECREF(v);
5413 return result;
5414
5415onError:
5416 Py_XDECREF(u);
5417 Py_XDECREF(v);
5418 return -1;
5419}
5420
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00005421PyObject *PyUnicode_RichCompare(PyObject *left,
5422 PyObject *right,
5423 int op)
5424{
5425 int result;
5426
5427 result = PyUnicode_Compare(left, right);
5428 if (result == -1 && PyErr_Occurred())
5429 goto onError;
5430
5431 /* Convert the return value to a Boolean */
5432 switch (op) {
5433 case Py_EQ:
5434 result = (result == 0);
5435 break;
5436 case Py_NE:
5437 result = (result != 0);
5438 break;
5439 case Py_LE:
5440 result = (result <= 0);
5441 break;
5442 case Py_GE:
5443 result = (result >= 0);
5444 break;
5445 case Py_LT:
5446 result = (result == -1);
5447 break;
5448 case Py_GT:
5449 result = (result == 1);
5450 break;
5451 }
5452 return PyBool_FromLong(result);
5453
5454 onError:
5455
5456 /* Standard case
5457
5458 Type errors mean that PyUnicode_FromObject() could not convert
5459 one of the arguments (usually the right hand side) to Unicode,
5460 ie. we can't handle the comparison request. However, it is
5461 possible that the other object knows a comparison method, which
5462 is why we return Py_NotImplemented to give the other object a
5463 chance.
5464
5465 */
5466 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5467 PyErr_Clear();
5468 Py_INCREF(Py_NotImplemented);
5469 return Py_NotImplemented;
5470 }
5471 if (op != Py_EQ && op != Py_NE)
5472 return NULL;
5473
5474 /* Equality comparison.
5475
5476 This is a special case: we silence any PyExc_UnicodeDecodeError
5477 and instead turn it into a PyErr_UnicodeWarning.
5478
5479 */
5480 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5481 return NULL;
5482 PyErr_Clear();
5483 if (PyErr_Warn(PyExc_UnicodeWarning,
5484 (op == Py_EQ) ?
5485 "Unicode equal comparison "
5486 "failed to convert both arguments to Unicode - "
5487 "interpreting them as being unequal" :
5488 "Unicode unequal comparison "
5489 "failed to convert both arguments to Unicode - "
5490 "interpreting them as being unequal"
5491 ) < 0)
5492 return NULL;
5493 result = (op == Py_NE);
5494 return PyBool_FromLong(result);
5495}
5496
Guido van Rossum403d68b2000-03-13 15:55:09 +00005497int PyUnicode_Contains(PyObject *container,
5498 PyObject *element)
5499{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005500 PyObject *str, *sub;
5501 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005502
5503 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005504 sub = PyUnicode_FromObject(element);
5505 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005506 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005507 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00005508 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005509 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00005510
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005511 str = PyUnicode_FromObject(container);
5512 if (!str) {
5513 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00005514 return -1;
5515 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005516
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005517 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00005518
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005519 Py_DECREF(str);
5520 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00005521
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005522 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005523}
5524
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525/* Concat to string or Unicode object giving a new Unicode object. */
5526
5527PyObject *PyUnicode_Concat(PyObject *left,
5528 PyObject *right)
5529{
5530 PyUnicodeObject *u = NULL, *v = NULL, *w;
5531
5532 /* Coerce the two arguments */
5533 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5534 if (u == NULL)
5535 goto onError;
5536 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5537 if (v == NULL)
5538 goto onError;
5539
5540 /* Shortcuts */
5541 if (v == unicode_empty) {
5542 Py_DECREF(v);
5543 return (PyObject *)u;
5544 }
5545 if (u == unicode_empty) {
5546 Py_DECREF(u);
5547 return (PyObject *)v;
5548 }
5549
5550 /* Concat the two Unicode strings */
5551 w = _PyUnicode_New(u->length + v->length);
5552 if (w == NULL)
5553 goto onError;
5554 Py_UNICODE_COPY(w->str, u->str, u->length);
5555 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5556
5557 Py_DECREF(u);
5558 Py_DECREF(v);
5559 return (PyObject *)w;
5560
5561onError:
5562 Py_XDECREF(u);
5563 Py_XDECREF(v);
5564 return NULL;
5565}
5566
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005567PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568"S.count(sub[, start[, end]]) -> int\n\
5569\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00005570Return the number of non-overlapping occurrences of substring sub in\n\
5571Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005572interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573
5574static PyObject *
5575unicode_count(PyUnicodeObject *self, PyObject *args)
5576{
5577 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005578 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005579 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580 PyObject *result;
5581
Guido van Rossumb8872e62000-05-09 14:14:27 +00005582 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5583 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584 return NULL;
5585
5586 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005587 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588 if (substring == NULL)
5589 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005590
Fredrik Lundhc8162812006-05-26 19:33:03 +00005591 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005593 result = PyInt_FromSsize_t(
5594 stringlib_count(self->str + start, end - start,
5595 substring->str, substring->length)
5596 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597
5598 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005599
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600 return result;
5601}
5602
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005603PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005604"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005606Encodes S using the codec registered for encoding. encoding defaults\n\
5607to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005608handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005609a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5610'xmlcharrefreplace' as well as any other name registered with\n\
5611codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612
5613static PyObject *
5614unicode_encode(PyUnicodeObject *self, PyObject *args)
5615{
5616 char *encoding = NULL;
5617 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005618 PyObject *v;
5619
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5621 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005622 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005623 if (v == NULL)
5624 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005625 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5626 PyErr_Format(PyExc_TypeError,
5627 "encoder did not return a string/unicode object "
5628 "(type=%.400s)",
5629 v->ob_type->tp_name);
5630 Py_DECREF(v);
5631 return NULL;
5632 }
5633 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005634
5635 onError:
5636 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005637}
5638
5639PyDoc_STRVAR(decode__doc__,
5640"S.decode([encoding[,errors]]) -> string or unicode\n\
5641\n\
5642Decodes S using the codec registered for encoding. encoding defaults\n\
5643to the default encoding. errors may be given to set a different error\n\
5644handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5645a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5646as well as any other name registerd with codecs.register_error that is\n\
5647able to handle UnicodeDecodeErrors.");
5648
5649static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005650unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005651{
5652 char *encoding = NULL;
5653 char *errors = NULL;
5654 PyObject *v;
5655
5656 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5657 return NULL;
5658 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005659 if (v == NULL)
5660 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005661 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5662 PyErr_Format(PyExc_TypeError,
5663 "decoder did not return a string/unicode object "
5664 "(type=%.400s)",
5665 v->ob_type->tp_name);
5666 Py_DECREF(v);
5667 return NULL;
5668 }
5669 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005670
5671 onError:
5672 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673}
5674
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005675PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676"S.expandtabs([tabsize]) -> unicode\n\
5677\n\
5678Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005679If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680
5681static PyObject*
5682unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5683{
5684 Py_UNICODE *e;
5685 Py_UNICODE *p;
5686 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005687 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688 PyUnicodeObject *u;
5689 int tabsize = 8;
5690
5691 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5692 return NULL;
5693
Thomas Wouters7e474022000-07-16 12:04:32 +00005694 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695 i = j = 0;
5696 e = self->str + self->length;
5697 for (p = self->str; p < e; p++)
5698 if (*p == '\t') {
5699 if (tabsize > 0)
5700 j += tabsize - (j % tabsize);
5701 }
5702 else {
5703 j++;
5704 if (*p == '\n' || *p == '\r') {
5705 i += j;
5706 j = 0;
5707 }
5708 }
5709
5710 /* Second pass: create output string and fill it */
5711 u = _PyUnicode_New(i + j);
5712 if (!u)
5713 return NULL;
5714
5715 j = 0;
5716 q = u->str;
5717
5718 for (p = self->str; p < e; p++)
5719 if (*p == '\t') {
5720 if (tabsize > 0) {
5721 i = tabsize - (j % tabsize);
5722 j += i;
5723 while (i--)
5724 *q++ = ' ';
5725 }
5726 }
5727 else {
5728 j++;
5729 *q++ = *p;
5730 if (*p == '\n' || *p == '\r')
5731 j = 0;
5732 }
5733
5734 return (PyObject*) u;
5735}
5736
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005737PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738"S.find(sub [,start [,end]]) -> int\n\
5739\n\
5740Return the lowest index in S where substring sub is found,\n\
5741such that sub is contained within s[start,end]. Optional\n\
5742arguments start and end are interpreted as in slice notation.\n\
5743\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005744Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745
5746static PyObject *
5747unicode_find(PyUnicodeObject *self, PyObject *args)
5748{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005749 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005750 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005751 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005752 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753
Guido van Rossumb8872e62000-05-09 14:14:27 +00005754 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5755 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005757 substring = PyUnicode_FromObject(substring);
5758 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759 return NULL;
5760
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005761 result = stringlib_find_slice(
5762 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5763 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5764 start, end
5765 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766
5767 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005768
5769 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770}
5771
5772static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005773unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005774{
5775 if (index < 0 || index >= self->length) {
5776 PyErr_SetString(PyExc_IndexError, "string index out of range");
5777 return NULL;
5778 }
5779
5780 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5781}
5782
5783static long
5784unicode_hash(PyUnicodeObject *self)
5785{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005786 /* Since Unicode objects compare equal to their ASCII string
5787 counterparts, they should use the individual character values
5788 as basis for their hash value. This is needed to assure that
5789 strings and Unicode objects behave in the same way as
5790 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005791
Martin v. Löwis18e16552006-02-15 17:27:45 +00005792 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005793 register Py_UNICODE *p;
5794 register long x;
5795
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796 if (self->hash != -1)
5797 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005798 len = PyUnicode_GET_SIZE(self);
5799 p = PyUnicode_AS_UNICODE(self);
5800 x = *p << 7;
5801 while (--len >= 0)
5802 x = (1000003*x) ^ *p++;
5803 x ^= PyUnicode_GET_SIZE(self);
5804 if (x == -1)
5805 x = -2;
5806 self->hash = x;
5807 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005808}
5809
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005810PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811"S.index(sub [,start [,end]]) -> int\n\
5812\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005813Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005814
5815static PyObject *
5816unicode_index(PyUnicodeObject *self, PyObject *args)
5817{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005818 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005819 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005820 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005821 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005822
Guido van Rossumb8872e62000-05-09 14:14:27 +00005823 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5824 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005826 substring = PyUnicode_FromObject(substring);
5827 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828 return NULL;
5829
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005830 result = stringlib_find_slice(
5831 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5832 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5833 start, end
5834 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835
5836 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005837
Guido van Rossumd57fd912000-03-10 22:53:23 +00005838 if (result < 0) {
5839 PyErr_SetString(PyExc_ValueError, "substring not found");
5840 return NULL;
5841 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005842
Martin v. Löwis18e16552006-02-15 17:27:45 +00005843 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844}
5845
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005846PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005847"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005849Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005850at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851
5852static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005853unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854{
5855 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5856 register const Py_UNICODE *e;
5857 int cased;
5858
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859 /* Shortcut for single character strings */
5860 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005861 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005863 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005864 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005865 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005866
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867 e = p + PyUnicode_GET_SIZE(self);
5868 cased = 0;
5869 for (; p < e; p++) {
5870 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005871
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005873 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874 else if (!cased && Py_UNICODE_ISLOWER(ch))
5875 cased = 1;
5876 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005877 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878}
5879
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005880PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005881"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005883Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005884at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005885
5886static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005887unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888{
5889 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5890 register const Py_UNICODE *e;
5891 int cased;
5892
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893 /* Shortcut for single character strings */
5894 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005895 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005897 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005898 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005899 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005900
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901 e = p + PyUnicode_GET_SIZE(self);
5902 cased = 0;
5903 for (; p < e; p++) {
5904 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005905
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005907 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908 else if (!cased && Py_UNICODE_ISUPPER(ch))
5909 cased = 1;
5910 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005911 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912}
5913
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005914PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005915"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005917Return True if S is a titlecased string and there is at least one\n\
5918character in S, i.e. upper- and titlecase characters may only\n\
5919follow uncased characters and lowercase characters only cased ones.\n\
5920Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921
5922static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005923unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924{
5925 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5926 register const Py_UNICODE *e;
5927 int cased, previous_is_cased;
5928
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 /* Shortcut for single character strings */
5930 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005931 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5932 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005934 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005935 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005936 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005937
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938 e = p + PyUnicode_GET_SIZE(self);
5939 cased = 0;
5940 previous_is_cased = 0;
5941 for (; p < e; p++) {
5942 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005943
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5945 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005946 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947 previous_is_cased = 1;
5948 cased = 1;
5949 }
5950 else if (Py_UNICODE_ISLOWER(ch)) {
5951 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005952 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 previous_is_cased = 1;
5954 cased = 1;
5955 }
5956 else
5957 previous_is_cased = 0;
5958 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005959 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960}
5961
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005962PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005963"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005965Return True if all characters in S are whitespace\n\
5966and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967
5968static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005969unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970{
5971 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5972 register const Py_UNICODE *e;
5973
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974 /* Shortcut for single character strings */
5975 if (PyUnicode_GET_SIZE(self) == 1 &&
5976 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005977 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005979 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005980 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005981 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005982
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983 e = p + PyUnicode_GET_SIZE(self);
5984 for (; p < e; p++) {
5985 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005986 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005988 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989}
5990
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005991PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005992"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005993\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005994Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005995and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005996
5997static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005998unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005999{
6000 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6001 register const Py_UNICODE *e;
6002
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006003 /* Shortcut for single character strings */
6004 if (PyUnicode_GET_SIZE(self) == 1 &&
6005 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006006 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006007
6008 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006009 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006010 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006011
6012 e = p + PyUnicode_GET_SIZE(self);
6013 for (; p < e; p++) {
6014 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006015 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006016 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006017 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006018}
6019
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006020PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006021"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006022\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006023Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006024and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006025
6026static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006027unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006028{
6029 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6030 register const Py_UNICODE *e;
6031
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006032 /* Shortcut for single character strings */
6033 if (PyUnicode_GET_SIZE(self) == 1 &&
6034 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006035 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006036
6037 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006038 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006039 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006040
6041 e = p + PyUnicode_GET_SIZE(self);
6042 for (; p < e; p++) {
6043 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006044 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006045 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006046 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006047}
6048
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006049PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006050"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006052Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006053False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054
6055static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006056unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057{
6058 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6059 register const Py_UNICODE *e;
6060
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061 /* Shortcut for single character strings */
6062 if (PyUnicode_GET_SIZE(self) == 1 &&
6063 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006064 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006065
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006066 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006067 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006068 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006069
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070 e = p + PyUnicode_GET_SIZE(self);
6071 for (; p < e; p++) {
6072 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006073 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006075 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076}
6077
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006078PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006079"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006081Return True if all characters in S are digits\n\
6082and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006083
6084static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006085unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006086{
6087 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6088 register const Py_UNICODE *e;
6089
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090 /* Shortcut for single character strings */
6091 if (PyUnicode_GET_SIZE(self) == 1 &&
6092 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006093 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006095 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006096 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006097 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006098
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099 e = p + PyUnicode_GET_SIZE(self);
6100 for (; p < e; p++) {
6101 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006102 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006104 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105}
6106
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006107PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006108"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006110Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006111False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112
6113static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006114unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115{
6116 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6117 register const Py_UNICODE *e;
6118
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119 /* Shortcut for single character strings */
6120 if (PyUnicode_GET_SIZE(self) == 1 &&
6121 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006122 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006124 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006125 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006126 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006127
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128 e = p + PyUnicode_GET_SIZE(self);
6129 for (; p < e; p++) {
6130 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006131 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006133 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134}
6135
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006136PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137"S.join(sequence) -> unicode\n\
6138\n\
6139Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006140sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141
6142static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006143unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006145 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146}
6147
Martin v. Löwis18e16552006-02-15 17:27:45 +00006148static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149unicode_length(PyUnicodeObject *self)
6150{
6151 return self->length;
6152}
6153
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006154PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006155"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156\n\
6157Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006158done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159
6160static PyObject *
6161unicode_ljust(PyUnicodeObject *self, PyObject *args)
6162{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006163 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006164 Py_UNICODE fillchar = ' ';
6165
Martin v. Löwis412fb672006-04-13 06:34:32 +00006166 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167 return NULL;
6168
Tim Peters7a29bd52001-09-12 03:03:31 +00006169 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170 Py_INCREF(self);
6171 return (PyObject*) self;
6172 }
6173
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006174 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175}
6176
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006177PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178"S.lower() -> unicode\n\
6179\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006180Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181
6182static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006183unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185 return fixup(self, fixlower);
6186}
6187
/* Strip modes accepted by the strip helpers. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, one per strip mode; the array is
   indexed by the mode constants above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
6196
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006197/* externally visible for str.strip(unicode) */
6198PyObject *
6199_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6200{
6201 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006202 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006203 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006204 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6205 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006206
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006207 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6208
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006209 i = 0;
6210 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006211 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6212 i++;
6213 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006214 }
6215
6216 j = len;
6217 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006218 do {
6219 j--;
6220 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6221 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006222 }
6223
6224 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006225 Py_INCREF(self);
6226 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006227 }
6228 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006229 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006230}
6231
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232
6233static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006234do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006236 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006237 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006238
6239 i = 0;
6240 if (striptype != RIGHTSTRIP) {
6241 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6242 i++;
6243 }
6244 }
6245
6246 j = len;
6247 if (striptype != LEFTSTRIP) {
6248 do {
6249 j--;
6250 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6251 j++;
6252 }
6253
6254 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6255 Py_INCREF(self);
6256 return (PyObject*)self;
6257 }
6258 else
6259 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260}
6261
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006262
6263static PyObject *
6264do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6265{
6266 PyObject *sep = NULL;
6267
6268 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6269 return NULL;
6270
6271 if (sep != NULL && sep != Py_None) {
6272 if (PyUnicode_Check(sep))
6273 return _PyUnicode_XStrip(self, striptype, sep);
6274 else if (PyString_Check(sep)) {
6275 PyObject *res;
6276 sep = PyUnicode_FromObject(sep);
6277 if (sep==NULL)
6278 return NULL;
6279 res = _PyUnicode_XStrip(self, striptype, sep);
6280 Py_DECREF(sep);
6281 return res;
6282 }
6283 else {
6284 PyErr_Format(PyExc_TypeError,
6285 "%s arg must be None, unicode or str",
6286 STRIPNAME(striptype));
6287 return NULL;
6288 }
6289 }
6290
6291 return do_strip(self, striptype);
6292}
6293
6294
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006295PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006296"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006297\n\
6298Return a copy of the string S with leading and trailing\n\
6299whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006300If chars is given and not None, remove characters in chars instead.\n\
6301If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006302
6303static PyObject *
6304unicode_strip(PyUnicodeObject *self, PyObject *args)
6305{
6306 if (PyTuple_GET_SIZE(args) == 0)
6307 return do_strip(self, BOTHSTRIP); /* Common case */
6308 else
6309 return do_argstrip(self, BOTHSTRIP, args);
6310}
6311
6312
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006313PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006314"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006315\n\
6316Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006317If chars is given and not None, remove characters in chars instead.\n\
6318If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006319
6320static PyObject *
6321unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6322{
6323 if (PyTuple_GET_SIZE(args) == 0)
6324 return do_strip(self, LEFTSTRIP); /* Common case */
6325 else
6326 return do_argstrip(self, LEFTSTRIP, args);
6327}
6328
6329
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006330PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006331"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006332\n\
6333Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006334If chars is given and not None, remove characters in chars instead.\n\
6335If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006336
6337static PyObject *
6338unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6339{
6340 if (PyTuple_GET_SIZE(args) == 0)
6341 return do_strip(self, RIGHTSTRIP); /* Common case */
6342 else
6343 return do_argstrip(self, RIGHTSTRIP, args);
6344}
6345
6346
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006348unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349{
6350 PyUnicodeObject *u;
6351 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006352 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006353 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354
6355 if (len < 0)
6356 len = 0;
6357
Tim Peters7a29bd52001-09-12 03:03:31 +00006358 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006359 /* no repeat, return original string */
6360 Py_INCREF(str);
6361 return (PyObject*) str;
6362 }
Tim Peters8f422462000-09-09 06:13:41 +00006363
6364 /* ensure # of chars needed doesn't overflow int and # of bytes
6365 * needed doesn't overflow size_t
6366 */
6367 nchars = len * str->length;
6368 if (len && nchars / len != str->length) {
6369 PyErr_SetString(PyExc_OverflowError,
6370 "repeated string is too long");
6371 return NULL;
6372 }
6373 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6374 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6375 PyErr_SetString(PyExc_OverflowError,
6376 "repeated string is too long");
6377 return NULL;
6378 }
6379 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006380 if (!u)
6381 return NULL;
6382
6383 p = u->str;
6384
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006385 if (str->length == 1 && len > 0) {
6386 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006387 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00006388 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006389 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006390 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006391 done = str->length;
6392 }
6393 while (done < nchars) {
6394 int n = (done <= nchars-done) ? done : nchars-done;
6395 Py_UNICODE_COPY(p+done, p, n);
6396 done += n;
6397 }
6398 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399
6400 return (PyObject*) u;
6401}
6402
6403PyObject *PyUnicode_Replace(PyObject *obj,
6404 PyObject *subobj,
6405 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006406 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407{
6408 PyObject *self;
6409 PyObject *str1;
6410 PyObject *str2;
6411 PyObject *result;
6412
6413 self = PyUnicode_FromObject(obj);
6414 if (self == NULL)
6415 return NULL;
6416 str1 = PyUnicode_FromObject(subobj);
6417 if (str1 == NULL) {
6418 Py_DECREF(self);
6419 return NULL;
6420 }
6421 str2 = PyUnicode_FromObject(replobj);
6422 if (str2 == NULL) {
6423 Py_DECREF(self);
6424 Py_DECREF(str1);
6425 return NULL;
6426 }
Tim Petersced69f82003-09-16 20:30:58 +00006427 result = replace((PyUnicodeObject *)self,
6428 (PyUnicodeObject *)str1,
6429 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430 maxcount);
6431 Py_DECREF(self);
6432 Py_DECREF(str1);
6433 Py_DECREF(str2);
6434 return result;
6435}
6436
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006437PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438"S.replace (old, new[, maxsplit]) -> unicode\n\
6439\n\
6440Return a copy of S with all occurrences of substring\n\
6441old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006442given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006443
6444static PyObject*
6445unicode_replace(PyUnicodeObject *self, PyObject *args)
6446{
6447 PyUnicodeObject *str1;
6448 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006449 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 PyObject *result;
6451
Martin v. Löwis18e16552006-02-15 17:27:45 +00006452 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453 return NULL;
6454 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6455 if (str1 == NULL)
6456 return NULL;
6457 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006458 if (str2 == NULL) {
6459 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006461 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462
6463 result = replace(self, str1, str2, maxcount);
6464
6465 Py_DECREF(str1);
6466 Py_DECREF(str2);
6467 return result;
6468}
6469
6470static
6471PyObject *unicode_repr(PyObject *unicode)
6472{
6473 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6474 PyUnicode_GET_SIZE(unicode),
6475 1);
6476}
6477
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006478PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479"S.rfind(sub [,start [,end]]) -> int\n\
6480\n\
6481Return the highest index in S where substring sub is found,\n\
6482such that sub is contained within s[start,end]. Optional\n\
6483arguments start and end are interpreted as in slice notation.\n\
6484\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006485Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486
6487static PyObject *
6488unicode_rfind(PyUnicodeObject *self, PyObject *args)
6489{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006490 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006491 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006492 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006493 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494
Guido van Rossumb8872e62000-05-09 14:14:27 +00006495 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6496 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006498 substring = PyUnicode_FromObject(substring);
6499 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500 return NULL;
6501
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006502 result = stringlib_rfind_slice(
6503 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6504 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6505 start, end
6506 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507
6508 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006509
6510 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511}
6512
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006513PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514"S.rindex(sub [,start [,end]]) -> int\n\
6515\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006516Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517
6518static PyObject *
6519unicode_rindex(PyUnicodeObject *self, PyObject *args)
6520{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006521 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006522 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006523 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006524 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525
Guido van Rossumb8872e62000-05-09 14:14:27 +00006526 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6527 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006529 substring = PyUnicode_FromObject(substring);
6530 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531 return NULL;
6532
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006533 result = stringlib_rfind_slice(
6534 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6535 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6536 start, end
6537 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538
6539 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006540
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541 if (result < 0) {
6542 PyErr_SetString(PyExc_ValueError, "substring not found");
6543 return NULL;
6544 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006545 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546}
6547
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006548PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006549"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550\n\
6551Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006552done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553
6554static PyObject *
6555unicode_rjust(PyUnicodeObject *self, PyObject *args)
6556{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006557 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006558 Py_UNICODE fillchar = ' ';
6559
Martin v. Löwis412fb672006-04-13 06:34:32 +00006560 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561 return NULL;
6562
Tim Peters7a29bd52001-09-12 03:03:31 +00006563 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564 Py_INCREF(self);
6565 return (PyObject*) self;
6566 }
6567
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006568 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569}
6570
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006572unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573{
6574 /* standard clamping */
6575 if (start < 0)
6576 start = 0;
6577 if (end < 0)
6578 end = 0;
6579 if (end > self->length)
6580 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006581 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582 /* full slice, return original string */
6583 Py_INCREF(self);
6584 return (PyObject*) self;
6585 }
6586 if (start > end)
6587 start = end;
6588 /* copy slice */
6589 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6590 end - start);
6591}
6592
6593PyObject *PyUnicode_Split(PyObject *s,
6594 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006595 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596{
6597 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006598
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599 s = PyUnicode_FromObject(s);
6600 if (s == NULL)
6601 return NULL;
6602 if (sep != NULL) {
6603 sep = PyUnicode_FromObject(sep);
6604 if (sep == NULL) {
6605 Py_DECREF(s);
6606 return NULL;
6607 }
6608 }
6609
6610 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6611
6612 Py_DECREF(s);
6613 Py_XDECREF(sep);
6614 return result;
6615}
6616
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006617PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618"S.split([sep [,maxsplit]]) -> list of strings\n\
6619\n\
6620Return a list of the words in S, using sep as the\n\
6621delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006622splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006623any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624
6625static PyObject*
6626unicode_split(PyUnicodeObject *self, PyObject *args)
6627{
6628 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006629 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630
Martin v. Löwis18e16552006-02-15 17:27:45 +00006631 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632 return NULL;
6633
6634 if (substring == Py_None)
6635 return split(self, NULL, maxcount);
6636 else if (PyUnicode_Check(substring))
6637 return split(self, (PyUnicodeObject *)substring, maxcount);
6638 else
6639 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6640}
6641
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006642PyObject *
6643PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6644{
6645 PyObject* str_obj;
6646 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006647 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00006648
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006649 str_obj = PyUnicode_FromObject(str_in);
6650 if (!str_obj)
6651 return NULL;
6652 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00006653 if (!sep_obj) {
6654 Py_DECREF(str_obj);
6655 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006656 }
6657
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006658 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00006659 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6660 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6661 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006662
Fredrik Lundhb9479482006-05-26 17:22:38 +00006663 Py_DECREF(sep_obj);
6664 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006665
6666 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006667}
6668
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006669
6670PyObject *
6671PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6672{
6673 PyObject* str_obj;
6674 PyObject* sep_obj;
6675 PyObject* out;
6676
6677 str_obj = PyUnicode_FromObject(str_in);
6678 if (!str_obj)
6679 return NULL;
6680 sep_obj = PyUnicode_FromObject(sep_in);
6681 if (!sep_obj) {
6682 Py_DECREF(str_obj);
6683 return NULL;
6684 }
6685
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006686 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006687 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6688 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6689 );
6690
6691 Py_DECREF(sep_obj);
6692 Py_DECREF(str_obj);
6693
6694 return out;
6695}
6696
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006697PyDoc_STRVAR(partition__doc__,
6698"S.partition(sep) -> (head, sep, tail)\n\
6699\n\
6700Searches for the separator sep in S, and returns the part before it,\n\
6701the separator itself, and the part after it. If the separator is not\n\
6702found, returns S and two empty strings.");
6703
6704static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00006705unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006706{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006707 return PyUnicode_Partition((PyObject *)self, separator);
6708}
6709
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006710PyDoc_STRVAR(rpartition__doc__,
Neal Norwitz29a5fdb2006-09-05 02:21:38 +00006711"S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006712\n\
6713Searches for the separator sep in S, starting at the end of S, and returns\n\
6714the part before it, the separator itself, and the part after it. If the\n\
Neal Norwitz29a5fdb2006-09-05 02:21:38 +00006715separator is not found, returns two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006716
6717static PyObject*
6718unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6719{
6720 return PyUnicode_RPartition((PyObject *)self, separator);
6721}
6722
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006723PyObject *PyUnicode_RSplit(PyObject *s,
6724 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006725 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006726{
6727 PyObject *result;
6728
6729 s = PyUnicode_FromObject(s);
6730 if (s == NULL)
6731 return NULL;
6732 if (sep != NULL) {
6733 sep = PyUnicode_FromObject(sep);
6734 if (sep == NULL) {
6735 Py_DECREF(s);
6736 return NULL;
6737 }
6738 }
6739
6740 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6741
6742 Py_DECREF(s);
6743 Py_XDECREF(sep);
6744 return result;
6745}
6746
6747PyDoc_STRVAR(rsplit__doc__,
6748"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6749\n\
6750Return a list of the words in S, using sep as the\n\
6751delimiter string, starting at the end of the string and\n\
6752working to the front. If maxsplit is given, at most maxsplit\n\
6753splits are done. If sep is not specified, any whitespace string\n\
6754is a separator.");
6755
6756static PyObject*
6757unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6758{
6759 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006760 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006761
Martin v. Löwis18e16552006-02-15 17:27:45 +00006762 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006763 return NULL;
6764
6765 if (substring == Py_None)
6766 return rsplit(self, NULL, maxcount);
6767 else if (PyUnicode_Check(substring))
6768 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6769 else
6770 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6771}
6772
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006773PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006774"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775\n\
6776Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006777Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006778is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779
6780static PyObject*
6781unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6782{
Guido van Rossum86662912000-04-11 15:38:46 +00006783 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784
Guido van Rossum86662912000-04-11 15:38:46 +00006785 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786 return NULL;
6787
Guido van Rossum86662912000-04-11 15:38:46 +00006788 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789}
6790
6791static
6792PyObject *unicode_str(PyUnicodeObject *self)
6793{
Fred Drakee4315f52000-05-09 19:53:39 +00006794 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795}
6796
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006797PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006798"S.swapcase() -> unicode\n\
6799\n\
6800Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006801and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802
6803static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006804unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806 return fixup(self, fixswapcase);
6807}
6808
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006809PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810"S.translate(table) -> unicode\n\
6811\n\
6812Return a copy of the string S, where all characters have been mapped\n\
6813through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006814Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6815Unmapped characters are left untouched. Characters mapped to None\n\
6816are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817
6818static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006819unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820{
Tim Petersced69f82003-09-16 20:30:58 +00006821 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006823 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824 "ignore");
6825}
6826
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006827PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828"S.upper() -> unicode\n\
6829\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006830Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831
6832static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006833unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835 return fixup(self, fixupper);
6836}
6837
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006838PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839"S.zfill(width) -> unicode\n\
6840\n\
6841Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006842of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843
6844static PyObject *
6845unicode_zfill(PyUnicodeObject *self, PyObject *args)
6846{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006847 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848 PyUnicodeObject *u;
6849
Martin v. Löwis18e16552006-02-15 17:27:45 +00006850 Py_ssize_t width;
6851 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852 return NULL;
6853
6854 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006855 if (PyUnicode_CheckExact(self)) {
6856 Py_INCREF(self);
6857 return (PyObject*) self;
6858 }
6859 else
6860 return PyUnicode_FromUnicode(
6861 PyUnicode_AS_UNICODE(self),
6862 PyUnicode_GET_SIZE(self)
6863 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864 }
6865
6866 fill = width - self->length;
6867
6868 u = pad(self, fill, 0, '0');
6869
Walter Dörwald068325e2002-04-15 13:36:47 +00006870 if (u == NULL)
6871 return NULL;
6872
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873 if (u->str[fill] == '+' || u->str[fill] == '-') {
6874 /* move sign to beginning of string */
6875 u->str[0] = u->str[fill];
6876 u->str[fill] = '0';
6877 }
6878
6879 return (PyObject*) u;
6880}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881
#if 0
/* Compiled out.  Returns the current value of unicode_freelist_size as a
   Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
6889
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006890PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006891"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006893Return True if S starts with the specified prefix, False otherwise.\n\
6894With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00006895With optional end, stop comparing S at that position.\n\
6896prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897
6898static PyObject *
6899unicode_startswith(PyUnicodeObject *self,
6900 PyObject *args)
6901{
Georg Brandl24250812006-06-09 18:45:48 +00006902 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006904 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006905 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00006906 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907
Georg Brandl24250812006-06-09 18:45:48 +00006908 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00006909 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00006911 if (PyTuple_Check(subobj)) {
6912 Py_ssize_t i;
6913 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6914 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6915 PyTuple_GET_ITEM(subobj, i));
6916 if (substring == NULL)
6917 return NULL;
6918 result = tailmatch(self, substring, start, end, -1);
6919 Py_DECREF(substring);
6920 if (result) {
6921 Py_RETURN_TRUE;
6922 }
6923 }
6924 /* nothing matched */
6925 Py_RETURN_FALSE;
6926 }
6927 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00006929 return NULL;
6930 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00006932 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933}
6934
6935
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006936PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006937"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006939Return True if S ends with the specified suffix, False otherwise.\n\
6940With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00006941With optional end, stop comparing S at that position.\n\
6942suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943
6944static PyObject *
6945unicode_endswith(PyUnicodeObject *self,
6946 PyObject *args)
6947{
Georg Brandl24250812006-06-09 18:45:48 +00006948 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006950 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006951 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00006952 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953
Georg Brandl24250812006-06-09 18:45:48 +00006954 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
6955 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00006957 if (PyTuple_Check(subobj)) {
6958 Py_ssize_t i;
6959 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6960 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6961 PyTuple_GET_ITEM(subobj, i));
6962 if (substring == NULL)
6963 return NULL;
6964 result = tailmatch(self, substring, start, end, +1);
6965 Py_DECREF(substring);
6966 if (result) {
6967 Py_RETURN_TRUE;
6968 }
6969 }
6970 Py_RETURN_FALSE;
6971 }
6972 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00006974 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975
Georg Brandl24250812006-06-09 18:45:48 +00006976 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006977 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00006978 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979}
6980
6981
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006982
6983static PyObject *
6984unicode_getnewargs(PyUnicodeObject *v)
6985{
6986 return Py_BuildValue("(u#)", v->str, v->length);
6987}
6988
6989
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990static PyMethodDef unicode_methods[] = {
6991
6992 /* Order is according to common usage: often used methods should
6993 appear first, since lookup is done sequentially. */
6994
Georg Brandlecdc0a92006-03-30 12:19:07 +00006995 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006996 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6997 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006998 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006999 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7000 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7001 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7002 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7003 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7004 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7005 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007006 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007007 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7008 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7009 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007010 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007011 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007012/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7013 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7014 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7015 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007016 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007017 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007018 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007019 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007020 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7021 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7022 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7023 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7024 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7025 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7026 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7027 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7028 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7029 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7030 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7031 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7032 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7033 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007034 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007035#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007036 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037#endif
7038
7039#if 0
7040 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007041 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042#endif
7043
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007044 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045 {NULL, NULL}
7046};
7047
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007048static PyObject *
7049unicode_mod(PyObject *v, PyObject *w)
7050{
7051 if (!PyUnicode_Check(v)) {
7052 Py_INCREF(Py_NotImplemented);
7053 return Py_NotImplemented;
7054 }
7055 return PyUnicode_Format(v, w);
7056}
7057
7058static PyNumberMethods unicode_as_number = {
7059 0, /*nb_add*/
7060 0, /*nb_subtract*/
7061 0, /*nb_multiply*/
7062 0, /*nb_divide*/
7063 unicode_mod, /*nb_remainder*/
7064};
7065
Guido van Rossumd57fd912000-03-10 22:53:23 +00007066static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007067 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00007068 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007069 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7070 (ssizeargfunc) unicode_getitem, /* sq_item */
7071 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072 0, /* sq_ass_item */
7073 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00007074 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007075};
7076
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007077static PyObject*
7078unicode_subscript(PyUnicodeObject* self, PyObject* item)
7079{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007080 if (PyIndex_Check(item)) {
7081 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007082 if (i == -1 && PyErr_Occurred())
7083 return NULL;
7084 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007085 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007086 return unicode_getitem(self, i);
7087 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007088 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007089 Py_UNICODE* source_buf;
7090 Py_UNICODE* result_buf;
7091 PyObject* result;
7092
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007093 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007094 &start, &stop, &step, &slicelength) < 0) {
7095 return NULL;
7096 }
7097
7098 if (slicelength <= 0) {
7099 return PyUnicode_FromUnicode(NULL, 0);
7100 } else {
7101 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Anthony Baxtera6286212006-04-11 07:42:36 +00007102 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7103 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007104
7105 if (result_buf == NULL)
7106 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007107
7108 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7109 result_buf[i] = source_buf[cur];
7110 }
Tim Petersced69f82003-09-16 20:30:58 +00007111
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007112 result = PyUnicode_FromUnicode(result_buf, slicelength);
7113 PyMem_FREE(result_buf);
7114 return result;
7115 }
7116 } else {
7117 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7118 return NULL;
7119 }
7120}
7121
7122static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007123 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007124 (binaryfunc)unicode_subscript, /* mp_subscript */
7125 (objobjargproc)0, /* mp_ass_subscript */
7126};
7127
Martin v. Löwis18e16552006-02-15 17:27:45 +00007128static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007130 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007131 const void **ptr)
7132{
7133 if (index != 0) {
7134 PyErr_SetString(PyExc_SystemError,
7135 "accessing non-existent unicode segment");
7136 return -1;
7137 }
7138 *ptr = (void *) self->str;
7139 return PyUnicode_GET_DATA_SIZE(self);
7140}
7141
Martin v. Löwis18e16552006-02-15 17:27:45 +00007142static Py_ssize_t
7143unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007144 const void **ptr)
7145{
7146 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007147 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007148 return -1;
7149}
7150
7151static int
7152unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007153 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007154{
7155 if (lenp)
7156 *lenp = PyUnicode_GET_DATA_SIZE(self);
7157 return 1;
7158}
7159
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007160static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007161unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007162 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007163 const void **ptr)
7164{
7165 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007166
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167 if (index != 0) {
7168 PyErr_SetString(PyExc_SystemError,
7169 "accessing non-existent unicode segment");
7170 return -1;
7171 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007172 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007173 if (str == NULL)
7174 return -1;
7175 *ptr = (void *) PyString_AS_STRING(str);
7176 return PyString_GET_SIZE(str);
7177}
7178
7179/* Helpers for PyUnicode_Format() */
7180
7181static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007182getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007183{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007184 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185 if (argidx < arglen) {
7186 (*p_argidx)++;
7187 if (arglen < 0)
7188 return args;
7189 else
7190 return PyTuple_GetItem(args, argidx);
7191 }
7192 PyErr_SetString(PyExc_TypeError,
7193 "not enough arguments for format string");
7194 return NULL;
7195}
7196
/* Bit flags collected while parsing a format specifier. */
#define F_LJUST 0x01    /* '-' flag */
#define F_SIGN  0x02    /* '+' flag */
#define F_BLANK 0x04    /* ' ' flag */
#define F_ALT   0x08    /* '#' flag */
#define F_ZERO  0x10    /* '0' flag */
7202
Martin v. Löwis18e16552006-02-15 17:27:45 +00007203static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007204strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007206 register Py_ssize_t i;
7207 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007208 for (i = len - 1; i >= 0; i--)
7209 buffer[i] = (Py_UNICODE) charbuffer[i];
7210
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211 return len;
7212}
7213
Neal Norwitzfc76d632006-01-10 06:03:13 +00007214static int
7215doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7216{
Tim Peters15231542006-02-16 01:08:01 +00007217 Py_ssize_t result;
7218
Neal Norwitzfc76d632006-01-10 06:03:13 +00007219 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007220 result = strtounicode(buffer, (char *)buffer);
7221 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007222}
7223
7224static int
7225longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7226{
Tim Peters15231542006-02-16 01:08:01 +00007227 Py_ssize_t result;
7228
Neal Norwitzfc76d632006-01-10 06:03:13 +00007229 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007230 result = strtounicode(buffer, (char *)buffer);
7231 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007232}
7233
Guido van Rossum078151d2002-08-11 04:24:12 +00007234/* XXX To save some code duplication, formatfloat/long/int could have been
7235 shared with stringobject.c, converting from 8-bit to Unicode after the
7236 formatting is done. */
7237
Guido van Rossumd57fd912000-03-10 22:53:23 +00007238static int
7239formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007240 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007241 int flags,
7242 int prec,
7243 int type,
7244 PyObject *v)
7245{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007246 /* fmt = '%#.' + `prec` + `type`
7247 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248 char fmt[20];
7249 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007250
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251 x = PyFloat_AsDouble(v);
7252 if (x == -1.0 && PyErr_Occurred())
7253 return -1;
7254 if (prec < 0)
7255 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7257 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007258 /* Worst case length calc to ensure no buffer overrun:
7259
7260 'g' formats:
7261 fmt = %#.<prec>g
7262 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7263 for any double rep.)
7264 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7265
7266 'f' formats:
7267 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7268 len = 1 + 50 + 1 + prec = 52 + prec
7269
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007270 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007271 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007272
7273 */
7274 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7275 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007276 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007277 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007278 return -1;
7279 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007280 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7281 (flags&F_ALT) ? "#" : "",
7282 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007283 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284}
7285
Tim Peters38fd5b62000-09-21 05:43:11 +00007286static PyObject*
7287formatlong(PyObject *val, int flags, int prec, int type)
7288{
7289 char *buf;
7290 int i, len;
7291 PyObject *str; /* temporary string object. */
7292 PyUnicodeObject *result;
7293
7294 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7295 if (!str)
7296 return NULL;
7297 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007298 if (!result) {
7299 Py_DECREF(str);
7300 return NULL;
7301 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007302 for (i = 0; i < len; i++)
7303 result->str[i] = buf[i];
7304 result->str[len] = 0;
7305 Py_DECREF(str);
7306 return (PyObject*)result;
7307}
7308
Guido van Rossumd57fd912000-03-10 22:53:23 +00007309static int
7310formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007311 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007312 int flags,
7313 int prec,
7314 int type,
7315 PyObject *v)
7316{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007317 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007318 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7319 * + 1 + 1
7320 * = 24
7321 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007322 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007323 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007324 long x;
7325
7326 x = PyInt_AsLong(v);
7327 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007328 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007329 if (x < 0 && type == 'u') {
7330 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007331 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007332 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7333 sign = "-";
7334 else
7335 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007336 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007337 prec = 1;
7338
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007339 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7340 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007341 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007342 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007343 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007344 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007345 return -1;
7346 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007347
7348 if ((flags & F_ALT) &&
7349 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007350 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007351 * of issues that cause pain:
7352 * - when 0 is being converted, the C standard leaves off
7353 * the '0x' or '0X', which is inconsistent with other
7354 * %#x/%#X conversions and inconsistent with Python's
7355 * hex() function
7356 * - there are platforms that violate the standard and
7357 * convert 0 with the '0x' or '0X'
7358 * (Metrowerks, Compaq Tru64)
7359 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007360 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007361 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007362 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007363 * We can achieve the desired consistency by inserting our
7364 * own '0x' or '0X' prefix, and substituting %x/%X in place
7365 * of %#x/%#X.
7366 *
7367 * Note that this is the same approach as used in
7368 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007369 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007370 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7371 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007372 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007373 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007374 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7375 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007376 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007377 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007378 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007379 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007380 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007381 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007382}
7383
7384static int
7385formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007386 size_t buflen,
7387 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007389 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007390 if (PyUnicode_Check(v)) {
7391 if (PyUnicode_GET_SIZE(v) != 1)
7392 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007393 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007394 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007395
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007396 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007397 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007398 goto onError;
7399 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7400 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007401
7402 else {
7403 /* Integer input truncated to a character */
7404 long x;
7405 x = PyInt_AsLong(v);
7406 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007407 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007408#ifdef Py_UNICODE_WIDE
7409 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007410 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007411 "%c arg not in range(0x110000) "
7412 "(wide Python build)");
7413 return -1;
7414 }
7415#else
7416 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007417 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007418 "%c arg not in range(0x10000) "
7419 "(narrow Python build)");
7420 return -1;
7421 }
7422#endif
7423 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007424 }
7425 buf[1] = '\0';
7426 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007427
7428 onError:
7429 PyErr_SetString(PyExc_TypeError,
7430 "%c requires int or char");
7431 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007432}
7433
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007434/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
7435
7436 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
7437 chars are formatted. XXX This is a magic number. Each formatting
7438 routine does bounds checking to ensure no overflow, but a better
7439 solution may be to malloc a buffer of appropriate size for each
7440 format. For now, the current solution is sufficient.
7441*/
/* Parenthesized so the cast binds as one unit wherever the macro expands. */
#define FORMATBUFLEN ((size_t)120)
7443
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444PyObject *PyUnicode_Format(PyObject *format,
7445 PyObject *args)
7446{
7447 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007448 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449 int args_owned = 0;
7450 PyUnicodeObject *result = NULL;
7451 PyObject *dict = NULL;
7452 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007453
Guido van Rossumd57fd912000-03-10 22:53:23 +00007454 if (format == NULL || args == NULL) {
7455 PyErr_BadInternalCall();
7456 return NULL;
7457 }
7458 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007459 if (uformat == NULL)
7460 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007461 fmt = PyUnicode_AS_UNICODE(uformat);
7462 fmtcnt = PyUnicode_GET_SIZE(uformat);
7463
7464 reslen = rescnt = fmtcnt + 100;
7465 result = _PyUnicode_New(reslen);
7466 if (result == NULL)
7467 goto onError;
7468 res = PyUnicode_AS_UNICODE(result);
7469
7470 if (PyTuple_Check(args)) {
7471 arglen = PyTuple_Size(args);
7472 argidx = 0;
7473 }
7474 else {
7475 arglen = -1;
7476 argidx = -2;
7477 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007478 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7479 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007480 dict = args;
7481
7482 while (--fmtcnt >= 0) {
7483 if (*fmt != '%') {
7484 if (--rescnt < 0) {
7485 rescnt = fmtcnt + 100;
7486 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007487 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007488 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007489 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7490 --rescnt;
7491 }
7492 *res++ = *fmt++;
7493 }
7494 else {
7495 /* Got a format specifier */
7496 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007497 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007499 Py_UNICODE c = '\0';
7500 Py_UNICODE fill;
7501 PyObject *v = NULL;
7502 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007503 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007505 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007506 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007507
7508 fmt++;
7509 if (*fmt == '(') {
7510 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007511 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007512 PyObject *key;
7513 int pcount = 1;
7514
7515 if (dict == NULL) {
7516 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007517 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007518 goto onError;
7519 }
7520 ++fmt;
7521 --fmtcnt;
7522 keystart = fmt;
7523 /* Skip over balanced parentheses */
7524 while (pcount > 0 && --fmtcnt >= 0) {
7525 if (*fmt == ')')
7526 --pcount;
7527 else if (*fmt == '(')
7528 ++pcount;
7529 fmt++;
7530 }
7531 keylen = fmt - keystart - 1;
7532 if (fmtcnt < 0 || pcount > 0) {
7533 PyErr_SetString(PyExc_ValueError,
7534 "incomplete format key");
7535 goto onError;
7536 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007537#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007538 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007539 then looked up since Python uses strings to hold
7540 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007541 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007542 key = PyUnicode_EncodeUTF8(keystart,
7543 keylen,
7544 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007545#else
7546 key = PyUnicode_FromUnicode(keystart, keylen);
7547#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007548 if (key == NULL)
7549 goto onError;
7550 if (args_owned) {
7551 Py_DECREF(args);
7552 args_owned = 0;
7553 }
7554 args = PyObject_GetItem(dict, key);
7555 Py_DECREF(key);
7556 if (args == NULL) {
7557 goto onError;
7558 }
7559 args_owned = 1;
7560 arglen = -1;
7561 argidx = -2;
7562 }
7563 while (--fmtcnt >= 0) {
7564 switch (c = *fmt++) {
7565 case '-': flags |= F_LJUST; continue;
7566 case '+': flags |= F_SIGN; continue;
7567 case ' ': flags |= F_BLANK; continue;
7568 case '#': flags |= F_ALT; continue;
7569 case '0': flags |= F_ZERO; continue;
7570 }
7571 break;
7572 }
7573 if (c == '*') {
7574 v = getnextarg(args, arglen, &argidx);
7575 if (v == NULL)
7576 goto onError;
7577 if (!PyInt_Check(v)) {
7578 PyErr_SetString(PyExc_TypeError,
7579 "* wants int");
7580 goto onError;
7581 }
7582 width = PyInt_AsLong(v);
7583 if (width < 0) {
7584 flags |= F_LJUST;
7585 width = -width;
7586 }
7587 if (--fmtcnt >= 0)
7588 c = *fmt++;
7589 }
7590 else if (c >= '0' && c <= '9') {
7591 width = c - '0';
7592 while (--fmtcnt >= 0) {
7593 c = *fmt++;
7594 if (c < '0' || c > '9')
7595 break;
7596 if ((width*10) / 10 != width) {
7597 PyErr_SetString(PyExc_ValueError,
7598 "width too big");
7599 goto onError;
7600 }
7601 width = width*10 + (c - '0');
7602 }
7603 }
7604 if (c == '.') {
7605 prec = 0;
7606 if (--fmtcnt >= 0)
7607 c = *fmt++;
7608 if (c == '*') {
7609 v = getnextarg(args, arglen, &argidx);
7610 if (v == NULL)
7611 goto onError;
7612 if (!PyInt_Check(v)) {
7613 PyErr_SetString(PyExc_TypeError,
7614 "* wants int");
7615 goto onError;
7616 }
7617 prec = PyInt_AsLong(v);
7618 if (prec < 0)
7619 prec = 0;
7620 if (--fmtcnt >= 0)
7621 c = *fmt++;
7622 }
7623 else if (c >= '0' && c <= '9') {
7624 prec = c - '0';
7625 while (--fmtcnt >= 0) {
7626 c = Py_CHARMASK(*fmt++);
7627 if (c < '0' || c > '9')
7628 break;
7629 if ((prec*10) / 10 != prec) {
7630 PyErr_SetString(PyExc_ValueError,
7631 "prec too big");
7632 goto onError;
7633 }
7634 prec = prec*10 + (c - '0');
7635 }
7636 }
7637 } /* prec */
7638 if (fmtcnt >= 0) {
7639 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007640 if (--fmtcnt >= 0)
7641 c = *fmt++;
7642 }
7643 }
7644 if (fmtcnt < 0) {
7645 PyErr_SetString(PyExc_ValueError,
7646 "incomplete format");
7647 goto onError;
7648 }
7649 if (c != '%') {
7650 v = getnextarg(args, arglen, &argidx);
7651 if (v == NULL)
7652 goto onError;
7653 }
7654 sign = 0;
7655 fill = ' ';
7656 switch (c) {
7657
7658 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007659 pbuf = formatbuf;
7660 /* presume that buffer length is at least 1 */
7661 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007662 len = 1;
7663 break;
7664
7665 case 's':
7666 case 'r':
7667 if (PyUnicode_Check(v) && c == 's') {
7668 temp = v;
7669 Py_INCREF(temp);
7670 }
7671 else {
7672 PyObject *unicode;
7673 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007674 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675 else
7676 temp = PyObject_Repr(v);
7677 if (temp == NULL)
7678 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007679 if (PyUnicode_Check(temp))
7680 /* nothing to do */;
7681 else if (PyString_Check(temp)) {
7682 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007683 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007684 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007685 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007687 Py_DECREF(temp);
7688 temp = unicode;
7689 if (temp == NULL)
7690 goto onError;
7691 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007692 else {
7693 Py_DECREF(temp);
7694 PyErr_SetString(PyExc_TypeError,
7695 "%s argument has non-string str()");
7696 goto onError;
7697 }
7698 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007699 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007700 len = PyUnicode_GET_SIZE(temp);
7701 if (prec >= 0 && len > prec)
7702 len = prec;
7703 break;
7704
7705 case 'i':
7706 case 'd':
7707 case 'u':
7708 case 'o':
7709 case 'x':
7710 case 'X':
7711 if (c == 'i')
7712 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007713 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007714 temp = formatlong(v, flags, prec, c);
7715 if (!temp)
7716 goto onError;
7717 pbuf = PyUnicode_AS_UNICODE(temp);
7718 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007719 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007721 else {
7722 pbuf = formatbuf;
7723 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7724 flags, prec, c, v);
7725 if (len < 0)
7726 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007727 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007728 }
7729 if (flags & F_ZERO)
7730 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731 break;
7732
7733 case 'e':
7734 case 'E':
7735 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007736 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007737 case 'g':
7738 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007739 if (c == 'F')
7740 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007741 pbuf = formatbuf;
7742 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7743 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007744 if (len < 0)
7745 goto onError;
7746 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007747 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748 fill = '0';
7749 break;
7750
7751 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007752 pbuf = formatbuf;
7753 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754 if (len < 0)
7755 goto onError;
7756 break;
7757
7758 default:
7759 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007760 "unsupported format character '%c' (0x%x) "
7761 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00007762 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007763 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00007764 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007765 goto onError;
7766 }
7767 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007768 if (*pbuf == '-' || *pbuf == '+') {
7769 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007770 len--;
7771 }
7772 else if (flags & F_SIGN)
7773 sign = '+';
7774 else if (flags & F_BLANK)
7775 sign = ' ';
7776 else
7777 sign = 0;
7778 }
7779 if (width < len)
7780 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007781 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007782 reslen -= rescnt;
7783 rescnt = width + fmtcnt + 100;
7784 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007785 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007786 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007787 PyErr_NoMemory();
7788 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007789 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007790 if (_PyUnicode_Resize(&result, reslen) < 0) {
7791 Py_XDECREF(temp);
7792 goto onError;
7793 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007794 res = PyUnicode_AS_UNICODE(result)
7795 + reslen - rescnt;
7796 }
7797 if (sign) {
7798 if (fill != ' ')
7799 *res++ = sign;
7800 rescnt--;
7801 if (width > len)
7802 width--;
7803 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007804 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7805 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007806 assert(pbuf[1] == c);
7807 if (fill != ' ') {
7808 *res++ = *pbuf++;
7809 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007810 }
Tim Petersfff53252001-04-12 18:38:48 +00007811 rescnt -= 2;
7812 width -= 2;
7813 if (width < 0)
7814 width = 0;
7815 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007816 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007817 if (width > len && !(flags & F_LJUST)) {
7818 do {
7819 --rescnt;
7820 *res++ = fill;
7821 } while (--width > len);
7822 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007823 if (fill == ' ') {
7824 if (sign)
7825 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007826 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007827 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007828 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007829 *res++ = *pbuf++;
7830 *res++ = *pbuf++;
7831 }
7832 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007833 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007834 res += len;
7835 rescnt -= len;
7836 while (--width >= len) {
7837 --rescnt;
7838 *res++ = ' ';
7839 }
7840 if (dict && (argidx < arglen) && c != '%') {
7841 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007842 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007843 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844 goto onError;
7845 }
7846 Py_XDECREF(temp);
7847 } /* '%' */
7848 } /* until end */
7849 if (argidx < arglen && !dict) {
7850 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007851 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007852 goto onError;
7853 }
7854
Thomas Woutersa96affe2006-03-12 00:29:36 +00007855 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7856 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007857 if (args_owned) {
7858 Py_DECREF(args);
7859 }
7860 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861 return (PyObject *)result;
7862
7863 onError:
7864 Py_XDECREF(result);
7865 Py_DECREF(uformat);
7866 if (args_owned) {
7867 Py_DECREF(args);
7868 }
7869 return NULL;
7870}
7871
/* Buffer-interface slot table for the unicode type.  It is installed as
   the tp_as_buffer entry of PyUnicode_Type; the entries are positional,
   so their order must not change. */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,     /* read buffer */
    (writebufferproc) unicode_buffer_getwritebuf,   /* write buffer */
    (segcountproc) unicode_buffer_getsegcount,      /* segment count */
    (charbufferproc) unicode_buffer_getcharbuf,     /* char buffer */
};
7878
Jeremy Hylton938ace62002-07-17 16:30:39 +00007879static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007880unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7881
Tim Peters6d6c1a32001-08-02 04:15:00 +00007882static PyObject *
7883unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7884{
7885 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007886 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007887 char *encoding = NULL;
7888 char *errors = NULL;
7889
Guido van Rossume023fe02001-08-30 03:12:59 +00007890 if (type != &PyUnicode_Type)
7891 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007892 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7893 kwlist, &x, &encoding, &errors))
7894 return NULL;
7895 if (x == NULL)
7896 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007897 if (encoding == NULL && errors == NULL)
7898 return PyObject_Unicode(x);
7899 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007900 return PyUnicode_FromEncodedObject(x, encoding, errors);
7901}
7902
Guido van Rossume023fe02001-08-30 03:12:59 +00007903static PyObject *
7904unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7905{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007906 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007907 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007908
7909 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7910 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7911 if (tmp == NULL)
7912 return NULL;
7913 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007914 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007915 if (pnew == NULL) {
7916 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007917 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007918 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007919 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7920 if (pnew->str == NULL) {
7921 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007922 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007923 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007924 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007925 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007926 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7927 pnew->length = n;
7928 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007929 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007930 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007931}
7932
/* Docstring of the unicode type; referenced by the tp_doc slot of
   PyUnicode_Type.  The continuation lines must start in column 0,
   because any leading whitespace would become part of the string. */
PyDoc_STRVAR(unicode_doc,
"unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007939
/* Type object of the unicode type.

   The initializer is positional: every entry must stay in the order of
   the PyTypeObject fields named in the trailing comments.  Entries set
   to 0 are left unset here.  Comparison is provided through the
   tp_richcompare slot only (tp_compare is 0), instances are created by
   unicode_new(), and the type derives from PyBaseString_Type. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    unicode_repr,                       /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
        Py_TPFLAGS_BASETYPE,            /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    PyUnicode_RichCompare,              /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    0,                                  /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
7984
7985/* Initialize the Unicode implementation */
7986
Thomas Wouters78890102000-07-22 19:25:51 +00007987void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007988{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007989 int i;
7990
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007991 /* XXX - move this array to unicodectype.c ? */
7992 Py_UNICODE linebreak[] = {
7993 0x000A, /* LINE FEED */
7994 0x000D, /* CARRIAGE RETURN */
7995 0x001C, /* FILE SEPARATOR */
7996 0x001D, /* GROUP SEPARATOR */
7997 0x001E, /* RECORD SEPARATOR */
7998 0x0085, /* NEXT LINE */
7999 0x2028, /* LINE SEPARATOR */
8000 0x2029, /* PARAGRAPH SEPARATOR */
8001 };
8002
Fred Drakee4315f52000-05-09 19:53:39 +00008003 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008004 unicode_freelist = NULL;
8005 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008006 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008007 if (!unicode_empty)
8008 return;
8009
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008010 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008011 for (i = 0; i < 256; i++)
8012 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008013 if (PyType_Ready(&PyUnicode_Type) < 0)
8014 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008015
8016 /* initialize the linebreak bloom filter */
8017 bloom_linebreak = make_bloom_mask(
8018 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8019 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008020
8021 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022}
8023
8024/* Finalize the Unicode implementation */
8025
8026void
Thomas Wouters78890102000-07-22 19:25:51 +00008027_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008029 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008030 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008032 Py_XDECREF(unicode_empty);
8033 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008034
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008035 for (i = 0; i < 256; i++) {
8036 if (unicode_latin1[i]) {
8037 Py_DECREF(unicode_latin1[i]);
8038 unicode_latin1[i] = NULL;
8039 }
8040 }
8041
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008042 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043 PyUnicodeObject *v = u;
8044 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008045 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008046 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008047 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008048 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008049 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008050 unicode_freelist = NULL;
8051 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008053
Anthony Baxterac6bd462006-04-13 02:06:09 +00008054#ifdef __cplusplus
8055}
8056#endif
8057
8058
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008059/*
8060Local variables:
8061c-basic-offset: 4
8062indent-tabs-mode: nil
8063End:
8064*/