blob: f2582398a8e427e9ce5e057a26f1534706346c76 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000116PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000117{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000118#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000119 return 0x10FFFF;
120#else
121 /* This is actually an illegal character, so it should
122 not be passed to unichr. */
123 return 0xFFFF;
124#endif
125}
126
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000127/* --- Bloom Filters ----------------------------------------------------- */
128
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant 5 bits of each Unicode character as the bit index. */
132
133/* the linebreak mask is set up by Unicode_Init below */
134
/* Integer type holding a bloom mask; one bit per value of (ch & 0x1F). */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of `ch` is set in `mask`.
   The shifted constant is unsigned long so that selecting bit 31 is well
   defined (1 << 31 on a signed int is undefined behaviour), and `mask` is
   parenthesized so arbitrary expressions can be passed. */
#define BLOOM(mask, ch) (((mask) & (1UL << ((ch) & 0x1F))))

/* Cheap pre-filter with bloom_linebreak, followed by the exact
   Py_UNICODE_ISLINEBREAK test. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
143
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000144Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000145{
146 /* calculate simple bloom-style bitmask for a given unicode string */
147
148 long mask;
149 Py_ssize_t i;
150
151 mask = 0;
152 for (i = 0; i < len; i++)
153 mask |= (1 << (ptr[i] & 0x1F));
154
155 return mask;
156}
157
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000158Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000159{
160 Py_ssize_t i;
161
162 for (i = 0; i < setlen; i++)
163 if (set[i] == chr)
164 return 1;
165
Fredrik Lundh77633512006-05-23 19:47:35 +0000166 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000167}
168
/* True if `chr` is in `set`: the bloom mask rejects most non-members
   cheaply before the exact unicode_member() scan runs.  The expansion is
   fully parenthesized so it composes safely with `!`, `||` and friends. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
171
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172/* --- Unicode Object ----------------------------------------------------- */
173
174static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000176 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177{
178 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000179
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000180 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000181 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000182 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184 /* Resizing shared object (unicode_empty or single character
185 objects) in-place is not allowed. Use PyUnicode_Resize()
186 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000187
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000188 if (unicode == unicode_empty ||
189 (unicode->length == 1 &&
190 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000191 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000192 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 return -1;
195 }
196
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000197 /* We allocate one more byte to make sure the string is Ux0000 terminated.
198 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000199 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000200 it contains). */
201
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 oldstr = unicode->str;
203 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
204 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000205 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000206 PyErr_NoMemory();
207 return -1;
208 }
209 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000210 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000212 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000214 if (unicode->defenc) {
215 Py_DECREF(unicode->defenc);
216 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217 }
218 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000219
Guido van Rossumd57fd912000-03-10 22:53:23 +0000220 return 0;
221}
222
223/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000224 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225
226 XXX This allocator could further be enhanced by assuring that the
227 free list never reduces its size below 1.
228
229*/
230
231static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000232PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233{
234 register PyUnicodeObject *unicode;
235
Andrew Dalkee0df7622006-05-27 11:04:36 +0000236 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237 if (length == 0 && unicode_empty != NULL) {
238 Py_INCREF(unicode_empty);
239 return unicode_empty;
240 }
241
242 /* Unicode freelist & memory allocation */
243 if (unicode_freelist) {
244 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000245 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000248 /* Keep-Alive optimization: we only upsize the buffer,
249 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000250 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000251 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000252 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000253 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254 }
255 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000256 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000258 }
259 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000262 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 if (unicode == NULL)
264 return NULL;
265 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
266 }
267
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000268 if (!unicode->str) {
269 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000270 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000271 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000272 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000273 * the caller fails before initializing str -- unicode_resize()
274 * reads str[0], and the Keep-Alive optimization can keep memory
275 * allocated for str alive across a call to unicode_dealloc(unicode).
276 * We don't want unicode_resize to read uninitialized memory in
277 * that case.
278 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000279 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000283 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285
286 onError:
287 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000288 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000289 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290}
291
292static
Guido van Rossum9475a232001-10-05 20:51:39 +0000293void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000295 if (PyUnicode_CheckExact(unicode) &&
296 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000297 /* Keep-Alive optimization */
298 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000299 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 unicode->str = NULL;
301 unicode->length = 0;
302 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000303 if (unicode->defenc) {
304 Py_DECREF(unicode->defenc);
305 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000306 }
307 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 *(PyUnicodeObject **)unicode = unicode_freelist;
309 unicode_freelist = unicode;
310 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311 }
312 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000313 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000314 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000315 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316 }
317}
318
Martin v. Löwis18e16552006-02-15 17:27:45 +0000319int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000320{
321 register PyUnicodeObject *v;
322
323 /* Argument checks */
324 if (unicode == NULL) {
325 PyErr_BadInternalCall();
326 return -1;
327 }
328 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000329 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000330 PyErr_BadInternalCall();
331 return -1;
332 }
333
334 /* Resizing unicode_empty and single character objects is not
335 possible since these are being shared. We simply return a fresh
336 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000337 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000338 (v == unicode_empty || v->length == 1)) {
339 PyUnicodeObject *w = _PyUnicode_New(length);
340 if (w == NULL)
341 return -1;
342 Py_UNICODE_COPY(w->str, v->str,
343 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000344 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000345 *unicode = (PyObject *)w;
346 return 0;
347 }
348
349 /* Note that we don't have to modify *unicode for unshared Unicode
350 objects, since we can modify them in-place. */
351 return unicode_resize(v, length);
352}
353
354/* Internal API for use in unicodeobject.c only ! */
355#define _PyUnicode_Resize(unicodevar, length) \
356 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
357
Guido van Rossumd57fd912000-03-10 22:53:23 +0000358PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000359 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360{
361 PyUnicodeObject *unicode;
362
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000363 /* If the Unicode data is known at construction time, we can apply
364 some optimizations which share commonly used objects. */
365 if (u != NULL) {
366
367 /* Optimization for empty strings */
368 if (size == 0 && unicode_empty != NULL) {
369 Py_INCREF(unicode_empty);
370 return (PyObject *)unicode_empty;
371 }
372
373 /* Single character Unicode objects in the Latin-1 range are
374 shared when using this constructor */
375 if (size == 1 && *u < 256) {
376 unicode = unicode_latin1[*u];
377 if (!unicode) {
378 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000379 if (!unicode)
380 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000381 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000382 unicode_latin1[*u] = unicode;
383 }
384 Py_INCREF(unicode);
385 return (PyObject *)unicode;
386 }
387 }
Tim Petersced69f82003-09-16 20:30:58 +0000388
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 unicode = _PyUnicode_New(size);
390 if (!unicode)
391 return NULL;
392
393 /* Copy the Unicode data into the new object */
394 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000395 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000396
397 return (PyObject *)unicode;
398}
399
400#ifdef HAVE_WCHAR_H
401
402PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000403 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000404{
405 PyUnicodeObject *unicode;
406
407 if (w == NULL) {
408 PyErr_BadInternalCall();
409 return NULL;
410 }
411
412 unicode = _PyUnicode_New(size);
413 if (!unicode)
414 return NULL;
415
416 /* Copy the wchar_t data into the new object */
417#ifdef HAVE_USABLE_WCHAR_T
418 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000419#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000420 {
421 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000422 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000424 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425 *u++ = *w++;
426 }
427#endif
428
429 return (PyObject *)unicode;
430}
431
Martin v. Löwis18e16552006-02-15 17:27:45 +0000432Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
433 wchar_t *w,
434 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000435{
436 if (unicode == NULL) {
437 PyErr_BadInternalCall();
438 return -1;
439 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000440
441 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000442 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000443 size = PyUnicode_GET_SIZE(unicode) + 1;
444
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445#ifdef HAVE_USABLE_WCHAR_T
446 memcpy(w, unicode->str, size * sizeof(wchar_t));
447#else
448 {
449 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000450 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000451 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000452 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000453 *w++ = *u++;
454 }
455#endif
456
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000457 if (size > PyUnicode_GET_SIZE(unicode))
458 return PyUnicode_GET_SIZE(unicode);
459 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000460 return size;
461}
462
463#endif
464
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000465PyObject *PyUnicode_FromOrdinal(int ordinal)
466{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000467 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000468
469#ifdef Py_UNICODE_WIDE
470 if (ordinal < 0 || ordinal > 0x10ffff) {
471 PyErr_SetString(PyExc_ValueError,
472 "unichr() arg not in range(0x110000) "
473 "(wide Python build)");
474 return NULL;
475 }
476#else
477 if (ordinal < 0 || ordinal > 0xffff) {
478 PyErr_SetString(PyExc_ValueError,
479 "unichr() arg not in range(0x10000) "
480 "(narrow Python build)");
481 return NULL;
482 }
483#endif
484
Hye-Shik Chang40574832004-04-06 07:24:51 +0000485 s[0] = (Py_UNICODE)ordinal;
486 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000487}
488
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489PyObject *PyUnicode_FromObject(register PyObject *obj)
490{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491 /* XXX Perhaps we should make this API an alias of
492 PyObject_Unicode() instead ?! */
493 if (PyUnicode_CheckExact(obj)) {
494 Py_INCREF(obj);
495 return obj;
496 }
497 if (PyUnicode_Check(obj)) {
498 /* For a Unicode subtype that's not a Unicode object,
499 return a true Unicode object with the same data. */
500 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
501 PyUnicode_GET_SIZE(obj));
502 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000503 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
504}
505
506PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
507 const char *encoding,
508 const char *errors)
509{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000510 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000511 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000512 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000513
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 if (obj == NULL) {
515 PyErr_BadInternalCall();
516 return NULL;
517 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000518
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000519#if 0
520 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000521 that no encodings is given and then redirect to
522 PyObject_Unicode() which then applies the additional logic for
523 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000524
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000525 NOTE: This API should really only be used for object which
526 represent *encoded* Unicode !
527
528 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000529 if (PyUnicode_Check(obj)) {
530 if (encoding) {
531 PyErr_SetString(PyExc_TypeError,
532 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000533 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000534 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000535 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000536 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000537#else
538 if (PyUnicode_Check(obj)) {
539 PyErr_SetString(PyExc_TypeError,
540 "decoding Unicode is not supported");
541 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000542 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000543#endif
544
545 /* Coerce object */
546 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000547 s = PyString_AS_STRING(obj);
548 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000549 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000550 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
551 /* Overwrite the error message with something more useful in
552 case of a TypeError. */
553 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000554 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000555 "coercing to Unicode: need string or buffer, "
556 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000557 obj->ob_type->tp_name);
558 goto onError;
559 }
Tim Petersced69f82003-09-16 20:30:58 +0000560
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000561 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000562 if (len == 0) {
563 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000564 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000565 }
Tim Petersced69f82003-09-16 20:30:58 +0000566 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000567 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000568
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000569 return v;
570
571 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000572 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573}
574
575PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000576 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000577 const char *encoding,
578 const char *errors)
579{
580 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000581
582 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000583 encoding = PyUnicode_GetDefaultEncoding();
584
585 /* Shortcuts for common default encodings */
586 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000587 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000588 else if (strcmp(encoding, "latin-1") == 0)
589 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000590#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
591 else if (strcmp(encoding, "mbcs") == 0)
592 return PyUnicode_DecodeMBCS(s, size, errors);
593#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000594 else if (strcmp(encoding, "ascii") == 0)
595 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596
597 /* Decode via the codec registry */
598 buffer = PyBuffer_FromMemory((void *)s, size);
599 if (buffer == NULL)
600 goto onError;
601 unicode = PyCodec_Decode(buffer, encoding, errors);
602 if (unicode == NULL)
603 goto onError;
604 if (!PyUnicode_Check(unicode)) {
605 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000606 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000607 unicode->ob_type->tp_name);
608 Py_DECREF(unicode);
609 goto onError;
610 }
611 Py_DECREF(buffer);
612 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000613
Guido van Rossumd57fd912000-03-10 22:53:23 +0000614 onError:
615 Py_XDECREF(buffer);
616 return NULL;
617}
618
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000619PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
620 const char *encoding,
621 const char *errors)
622{
623 PyObject *v;
624
625 if (!PyUnicode_Check(unicode)) {
626 PyErr_BadArgument();
627 goto onError;
628 }
629
630 if (encoding == NULL)
631 encoding = PyUnicode_GetDefaultEncoding();
632
633 /* Decode via the codec registry */
634 v = PyCodec_Decode(unicode, encoding, errors);
635 if (v == NULL)
636 goto onError;
637 return v;
638
639 onError:
640 return NULL;
641}
642
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000644 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 const char *encoding,
646 const char *errors)
647{
648 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000649
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 unicode = PyUnicode_FromUnicode(s, size);
651 if (unicode == NULL)
652 return NULL;
653 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
654 Py_DECREF(unicode);
655 return v;
656}
657
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000658PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
659 const char *encoding,
660 const char *errors)
661{
662 PyObject *v;
663
664 if (!PyUnicode_Check(unicode)) {
665 PyErr_BadArgument();
666 goto onError;
667 }
668
669 if (encoding == NULL)
670 encoding = PyUnicode_GetDefaultEncoding();
671
672 /* Encode via the codec registry */
673 v = PyCodec_Encode(unicode, encoding, errors);
674 if (v == NULL)
675 goto onError;
676 return v;
677
678 onError:
679 return NULL;
680}
681
Guido van Rossumd57fd912000-03-10 22:53:23 +0000682PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
683 const char *encoding,
684 const char *errors)
685{
686 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000687
Guido van Rossumd57fd912000-03-10 22:53:23 +0000688 if (!PyUnicode_Check(unicode)) {
689 PyErr_BadArgument();
690 goto onError;
691 }
Fred Drakee4315f52000-05-09 19:53:39 +0000692
Tim Petersced69f82003-09-16 20:30:58 +0000693 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000694 encoding = PyUnicode_GetDefaultEncoding();
695
696 /* Shortcuts for common default encodings */
697 if (errors == NULL) {
698 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000699 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000700 else if (strcmp(encoding, "latin-1") == 0)
701 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000702#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
703 else if (strcmp(encoding, "mbcs") == 0)
704 return PyUnicode_AsMBCSString(unicode);
705#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000706 else if (strcmp(encoding, "ascii") == 0)
707 return PyUnicode_AsASCIIString(unicode);
708 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000709
710 /* Encode via the codec registry */
711 v = PyCodec_Encode(unicode, encoding, errors);
712 if (v == NULL)
713 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000714 if (!PyString_Check(v)) {
715 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000716 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717 v->ob_type->tp_name);
718 Py_DECREF(v);
719 goto onError;
720 }
721 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000722
Guido van Rossumd57fd912000-03-10 22:53:23 +0000723 onError:
724 return NULL;
725}
726
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000727PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
728 const char *errors)
729{
730 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
731
732 if (v)
733 return v;
734 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
735 if (v && errors == NULL)
736 ((PyUnicodeObject *)unicode)->defenc = v;
737 return v;
738}
739
Guido van Rossumd57fd912000-03-10 22:53:23 +0000740Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
741{
742 if (!PyUnicode_Check(unicode)) {
743 PyErr_BadArgument();
744 goto onError;
745 }
746 return PyUnicode_AS_UNICODE(unicode);
747
748 onError:
749 return NULL;
750}
751
Martin v. Löwis18e16552006-02-15 17:27:45 +0000752Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000753{
754 if (!PyUnicode_Check(unicode)) {
755 PyErr_BadArgument();
756 goto onError;
757 }
758 return PyUnicode_GET_SIZE(unicode);
759
760 onError:
761 return -1;
762}
763
Thomas Wouters78890102000-07-22 19:25:51 +0000764const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000765{
766 return unicode_default_encoding;
767}
768
769int PyUnicode_SetDefaultEncoding(const char *encoding)
770{
771 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000772
Fred Drakee4315f52000-05-09 19:53:39 +0000773 /* Make sure the encoding is valid. As side effect, this also
774 loads the encoding into the codec registry cache. */
775 v = _PyCodec_Lookup(encoding);
776 if (v == NULL)
777 goto onError;
778 Py_DECREF(v);
779 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000780 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000781 sizeof(unicode_default_encoding));
782 return 0;
783
784 onError:
785 return -1;
786}
787
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000788/* error handling callback helper:
789 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000790 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000791 and adjust various state variables.
792 return 0 on success, -1 on error
793*/
794
795static
796int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
797 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000798 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
799 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000800{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000801 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000802
803 PyObject *restuple = NULL;
804 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000805 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
806 Py_ssize_t requiredsize;
807 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000808 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000809 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000810 int res = -1;
811
812 if (*errorHandler == NULL) {
813 *errorHandler = PyCodec_LookupError(errors);
814 if (*errorHandler == NULL)
815 goto onError;
816 }
817
818 if (*exceptionObject == NULL) {
819 *exceptionObject = PyUnicodeDecodeError_Create(
820 encoding, input, insize, *startinpos, *endinpos, reason);
821 if (*exceptionObject == NULL)
822 goto onError;
823 }
824 else {
825 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
826 goto onError;
827 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
828 goto onError;
829 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
830 goto onError;
831 }
832
833 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
834 if (restuple == NULL)
835 goto onError;
836 if (!PyTuple_Check(restuple)) {
837 PyErr_Format(PyExc_TypeError, &argparse[4]);
838 goto onError;
839 }
840 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
841 goto onError;
842 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000843 newpos = insize+newpos;
844 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000845 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000846 goto onError;
847 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000848
849 /* need more space? (at least enough for what we
850 have+the replacement+the rest of the string (starting
851 at the new input position), so we won't have to check space
852 when there are no errors in the rest of the string) */
853 repptr = PyUnicode_AS_UNICODE(repunicode);
854 repsize = PyUnicode_GET_SIZE(repunicode);
855 requiredsize = *outpos + repsize + insize-newpos;
856 if (requiredsize > outsize) {
857 if (requiredsize<2*outsize)
858 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000859 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000860 goto onError;
861 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
862 }
863 *endinpos = newpos;
864 *inptr = input + newpos;
865 Py_UNICODE_COPY(*outptr, repptr, repsize);
866 *outptr += repsize;
867 *outpos += repsize;
868 /* we made it! */
869 res = 0;
870
871 onError:
872 Py_XDECREF(restuple);
873 return res;
874}
875
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000876/* --- UTF-7 Codec -------------------------------------------------------- */
877
878/* see RFC2152 for details */
879
static
char utf7_special[128] = {
    /* Per-ASCII-code classification used by SPECIAL() to decide
       whether a character can be written directly in UTF-7:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};

/* SPECIAL(u, setO, ws): non-zero when `u` must be base64-encoded.
   Anything above 127 or not above 0 is special, as is every class-1
   entry; class 2 counts only when `ws` is set and class 3 only when
   `setO` is set.

   Note: the (u) <= 0 test (instead of (u) < 0) is a trick to work
   around gcc warnings about the comparison always being false; since
   utf7_special[0] is 1, making that one comparison true is harmless. */

#define SPECIAL(u, setO, ws)                                    \
    ((u) > 127 || (u) <= 0 || utf7_special[(u)] == 1 ||         \
     ((ws) && (utf7_special[(u)] == 2)) ||                      \
     ((setO) && (utf7_special[(u)] == 3)))

/* B64(v): base64 alphabet character for the low six bits of `v`. */
#define B64(v)                                                  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(v) & 0x3f])

/* B64CHAR(x): true when `x` belongs to the base64 alphabet. */
#define B64CHAR(x)                                              \
    (isalnum(x) || (x) == '+' || (x) == '/')

/* UB64(x): six-bit value of base64 character `x`
   ('+' 62, '/' 63, 'a'.. 26.., 'A'.. 0.., otherwise x + 4 which maps
   '0'.. to 52..). */
#define UB64(x)                                                 \
    ((x) == '+' ? 62 :                                          \
     (x) == '/' ? 63 :                                          \
     (x) >= 'a' ? (x) - 71 :                                    \
     (x) >= 'A' ? (x) - 65 :                                    \
                  (x) + 4 )

/* ENCODE(dst, acc, nbits): while at least six bits are pending in
   `acc`, write the top six as one base64 character to *dst++ and
   reduce `nbits` accordingly.  Updates `dst` and `nbits` in place. */
#define ENCODE(dst, acc, nbits)                                 \
    while (nbits >= 6) {                                        \
        *dst++ = B64(acc >> (nbits-6));                         \
        nbits -= 6;                                             \
    }

/* DECODE(dst, acc, nbits, pending): while at least sixteen bits are
   pending in `acc`, extract the top sixteen as one code unit.  Relies
   on the enclosing function providing `errmsg` and a `utf7Error`
   label. */
#define DECODE(dst, acc, nbits, pending)                                \
    while (nbits >= 16) {                                               \
        Py_UNICODE outCh = (Py_UNICODE) ((acc >> (nbits-16)) & 0xffff); \
        nbits -= 16;                                                    \
        if (pending) {                                                  \
            /* An error was already reported for the previous unit,     \
               so this one is dropped without further checks */         \
            pending = 0;                                                \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* A unit in 0xDC00..0xDFFF belongs to a surrogate pair,    \
               which cannot be represented in a 16-bit character */     \
            pending = 1;                                                \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *dst++ = outCh;                                             \
        }                                                               \
    }

Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000942PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000943 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000944 const char *errors)
945{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000946 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000947 Py_ssize_t startinpos;
948 Py_ssize_t endinpos;
949 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000950 const char *e;
951 PyUnicodeObject *unicode;
952 Py_UNICODE *p;
953 const char *errmsg = "";
954 int inShift = 0;
955 unsigned int bitsleft = 0;
956 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000957 int surrogate = 0;
958 PyObject *errorHandler = NULL;
959 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000960
961 unicode = _PyUnicode_New(size);
962 if (!unicode)
963 return NULL;
964 if (size == 0)
965 return (PyObject *)unicode;
966
967 p = unicode->str;
968 e = s + size;
969
970 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000971 Py_UNICODE ch;
972 restart:
973 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000974
975 if (inShift) {
976 if ((ch == '-') || !B64CHAR(ch)) {
977 inShift = 0;
978 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000979
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000980 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
981 if (bitsleft >= 6) {
982 /* The shift sequence has a partial character in it. If
983 bitsleft < 6 then we could just classify it as padding
984 but that is not the case here */
985
986 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000987 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000988 }
989 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000990 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000991 here so indicate the potential of a misencoded character. */
992
993 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
994 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
995 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000996 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000997 }
998
999 if (ch == '-') {
1000 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001001 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001002 inShift = 1;
1003 }
1004 } else if (SPECIAL(ch,0,0)) {
1005 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001006 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001007 } else {
1008 *p++ = ch;
1009 }
1010 } else {
1011 charsleft = (charsleft << 6) | UB64(ch);
1012 bitsleft += 6;
1013 s++;
1014 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1015 }
1016 }
1017 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001018 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001019 s++;
1020 if (s < e && *s == '-') {
1021 s++;
1022 *p++ = '+';
1023 } else
1024 {
1025 inShift = 1;
1026 bitsleft = 0;
1027 }
1028 }
1029 else if (SPECIAL(ch,0,0)) {
1030 errmsg = "unexpected special character";
1031 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001032 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001033 }
1034 else {
1035 *p++ = ch;
1036 s++;
1037 }
1038 continue;
1039 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001040 outpos = p-PyUnicode_AS_UNICODE(unicode);
1041 endinpos = s-starts;
1042 if (unicode_decode_call_errorhandler(
1043 errors, &errorHandler,
1044 "utf7", errmsg,
1045 starts, size, &startinpos, &endinpos, &exc, &s,
1046 (PyObject **)&unicode, &outpos, &p))
1047 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001048 }
1049
1050 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001051 outpos = p-PyUnicode_AS_UNICODE(unicode);
1052 endinpos = size;
1053 if (unicode_decode_call_errorhandler(
1054 errors, &errorHandler,
1055 "utf7", "unterminated shift sequence",
1056 starts, size, &startinpos, &endinpos, &exc, &s,
1057 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001058 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001059 if (s < e)
1060 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001061 }
1062
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001063 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001064 goto onError;
1065
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001066 Py_XDECREF(errorHandler);
1067 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001068 return (PyObject *)unicode;
1069
1070onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001071 Py_XDECREF(errorHandler);
1072 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001073 Py_DECREF(unicode);
1074 return NULL;
1075}
1076
1077
1078PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001079 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001080 int encodeSetO,
1081 int encodeWhiteSpace,
1082 const char *errors)
1083{
1084 PyObject *v;
1085 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001086 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001087 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001088 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001089 unsigned int bitsleft = 0;
1090 unsigned long charsleft = 0;
1091 char * out;
1092 char * start;
1093
1094 if (size == 0)
1095 return PyString_FromStringAndSize(NULL, 0);
1096
1097 v = PyString_FromStringAndSize(NULL, cbAllocated);
1098 if (v == NULL)
1099 return NULL;
1100
1101 start = out = PyString_AS_STRING(v);
1102 for (;i < size; ++i) {
1103 Py_UNICODE ch = s[i];
1104
1105 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001106 if (ch == '+') {
1107 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001108 *out++ = '-';
1109 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1110 charsleft = ch;
1111 bitsleft = 16;
1112 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001113 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001114 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001115 } else {
1116 *out++ = (char) ch;
1117 }
1118 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001119 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1120 *out++ = B64(charsleft << (6-bitsleft));
1121 charsleft = 0;
1122 bitsleft = 0;
1123 /* Characters not in the BASE64 set implicitly unshift the sequence
1124 so no '-' is required, except if the character is itself a '-' */
1125 if (B64CHAR(ch) || ch == '-') {
1126 *out++ = '-';
1127 }
1128 inShift = 0;
1129 *out++ = (char) ch;
1130 } else {
1131 bitsleft += 16;
1132 charsleft = (charsleft << 16) | ch;
1133 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1134
1135 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001136 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001137 or '-' then the shift sequence will be terminated implicitly and we
1138 don't have to insert a '-'. */
1139
1140 if (bitsleft == 0) {
1141 if (i + 1 < size) {
1142 Py_UNICODE ch2 = s[i+1];
1143
1144 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001145
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001146 } else if (B64CHAR(ch2) || ch2 == '-') {
1147 *out++ = '-';
1148 inShift = 0;
1149 } else {
1150 inShift = 0;
1151 }
1152
1153 }
1154 else {
1155 *out++ = '-';
1156 inShift = 0;
1157 }
1158 }
Tim Petersced69f82003-09-16 20:30:58 +00001159 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001160 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001161 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001162 if (bitsleft) {
1163 *out++= B64(charsleft << (6-bitsleft) );
1164 *out++ = '-';
1165 }
1166
Tim Peters5de98422002-04-27 18:44:32 +00001167 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001168 return v;
1169}
1170
1171#undef SPECIAL
1172#undef B64
1173#undef B64CHAR
1174#undef UB64
1175#undef ENCODE
1176#undef DECODE
1177
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178/* --- UTF-8 Codec -------------------------------------------------------- */
1179
static
char utf8_code_length[256] = {
    /* Sequence length implied by a UTF-8 lead byte, indexed by the
       byte value.  Zero marks a byte that cannot start a sequence.
       See RFC 2279 for details. */
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1201
Guido van Rossumd57fd912000-03-10 22:53:23 +00001202PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001203 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001204 const char *errors)
1205{
Walter Dörwald69652032004-09-07 20:24:22 +00001206 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1207}
1208
1209PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001210 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001211 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001212 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001213{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001214 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001215 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001216 Py_ssize_t startinpos;
1217 Py_ssize_t endinpos;
1218 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219 const char *e;
1220 PyUnicodeObject *unicode;
1221 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001222 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001223 PyObject *errorHandler = NULL;
1224 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225
1226 /* Note: size will always be longer than the resulting Unicode
1227 character count */
1228 unicode = _PyUnicode_New(size);
1229 if (!unicode)
1230 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001231 if (size == 0) {
1232 if (consumed)
1233 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001234 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236
1237 /* Unpack UTF-8 encoded data */
1238 p = unicode->str;
1239 e = s + size;
1240
1241 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001242 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243
1244 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001245 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001246 s++;
1247 continue;
1248 }
1249
1250 n = utf8_code_length[ch];
1251
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001252 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001253 if (consumed)
1254 break;
1255 else {
1256 errmsg = "unexpected end of data";
1257 startinpos = s-starts;
1258 endinpos = size;
1259 goto utf8Error;
1260 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001261 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001262
1263 switch (n) {
1264
1265 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001266 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001267 startinpos = s-starts;
1268 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001269 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001270
1271 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001272 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001273 startinpos = s-starts;
1274 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001275 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276
1277 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001278 if ((s[1] & 0xc0) != 0x80) {
1279 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001280 startinpos = s-starts;
1281 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001282 goto utf8Error;
1283 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001284 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001285 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001286 startinpos = s-starts;
1287 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001288 errmsg = "illegal encoding";
1289 goto utf8Error;
1290 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001292 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 break;
1294
1295 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001296 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001297 (s[2] & 0xc0) != 0x80) {
1298 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001299 startinpos = s-starts;
1300 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001301 goto utf8Error;
1302 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001303 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001304 if (ch < 0x0800) {
1305 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001306 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001307
1308 XXX For wide builds (UCS-4) we should probably try
1309 to recombine the surrogates into a single code
1310 unit.
1311 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001312 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001313 startinpos = s-starts;
1314 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001315 goto utf8Error;
1316 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001317 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001318 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001319 break;
1320
1321 case 4:
1322 if ((s[1] & 0xc0) != 0x80 ||
1323 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001324 (s[3] & 0xc0) != 0x80) {
1325 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001326 startinpos = s-starts;
1327 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001328 goto utf8Error;
1329 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001330 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1331 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1332 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001333 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001334 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001335 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001336 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001337 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001338 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001339 startinpos = s-starts;
1340 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001341 goto utf8Error;
1342 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001343#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001344 *p++ = (Py_UNICODE)ch;
1345#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001346 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001347
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001348 /* translate from 10000..10FFFF to 0..FFFF */
1349 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001350
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001351 /* high surrogate = top 10 bits added to D800 */
1352 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001353
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001354 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001355 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001356#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001357 break;
1358
1359 default:
1360 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001361 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001362 startinpos = s-starts;
1363 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001364 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365 }
1366 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001367 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001368
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001369 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001370 outpos = p-PyUnicode_AS_UNICODE(unicode);
1371 if (unicode_decode_call_errorhandler(
1372 errors, &errorHandler,
1373 "utf8", errmsg,
1374 starts, size, &startinpos, &endinpos, &exc, &s,
1375 (PyObject **)&unicode, &outpos, &p))
1376 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001377 }
Walter Dörwald69652032004-09-07 20:24:22 +00001378 if (consumed)
1379 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001380
1381 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001382 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001383 goto onError;
1384
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001385 Py_XDECREF(errorHandler);
1386 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 return (PyObject *)unicode;
1388
1389onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001390 Py_XDECREF(errorHandler);
1391 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 Py_DECREF(unicode);
1393 return NULL;
1394}
1395
Tim Peters602f7402002-04-27 18:03:26 +00001396/* Allocation strategy: if the string is short, convert into a stack buffer
1397 and allocate exactly as much space needed at the end. Else allocate the
1398 maximum possible needed (4 result bytes per Unicode character), and return
1399 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001400*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001401PyObject *
1402PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001403 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001404 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001405{
Tim Peters602f7402002-04-27 18:03:26 +00001406#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001407
Martin v. Löwis18e16552006-02-15 17:27:45 +00001408 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001409 PyObject *v; /* result string object */
1410 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001411 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001412 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001413 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001414
Tim Peters602f7402002-04-27 18:03:26 +00001415 assert(s != NULL);
1416 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001417
Tim Peters602f7402002-04-27 18:03:26 +00001418 if (size <= MAX_SHORT_UNICHARS) {
1419 /* Write into the stack buffer; nallocated can't overflow.
1420 * At the end, we'll allocate exactly as much heap space as it
1421 * turns out we need.
1422 */
1423 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1424 v = NULL; /* will allocate after we're done */
1425 p = stackbuf;
1426 }
1427 else {
1428 /* Overallocate on the heap, and give the excess back at the end. */
1429 nallocated = size * 4;
1430 if (nallocated / 4 != size) /* overflow! */
1431 return PyErr_NoMemory();
1432 v = PyString_FromStringAndSize(NULL, nallocated);
1433 if (v == NULL)
1434 return NULL;
1435 p = PyString_AS_STRING(v);
1436 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001437
Tim Peters602f7402002-04-27 18:03:26 +00001438 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001439 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001440
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001441 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001442 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001443 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001444
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001446 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001447 *p++ = (char)(0xc0 | (ch >> 6));
1448 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001449 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001450 else {
Tim Peters602f7402002-04-27 18:03:26 +00001451 /* Encode UCS2 Unicode ordinals */
1452 if (ch < 0x10000) {
1453 /* Special case: check for high surrogate */
1454 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1455 Py_UCS4 ch2 = s[i];
1456 /* Check for low surrogate and combine the two to
1457 form a UCS4 value */
1458 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001459 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001460 i++;
1461 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001462 }
Tim Peters602f7402002-04-27 18:03:26 +00001463 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001464 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001465 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001466 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1467 *p++ = (char)(0x80 | (ch & 0x3f));
1468 continue;
1469 }
1470encodeUCS4:
1471 /* Encode UCS4 Unicode ordinals */
1472 *p++ = (char)(0xf0 | (ch >> 18));
1473 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1474 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1475 *p++ = (char)(0x80 | (ch & 0x3f));
1476 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001477 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001478
Tim Peters602f7402002-04-27 18:03:26 +00001479 if (v == NULL) {
1480 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001481 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001482 assert(nneeded <= nallocated);
1483 v = PyString_FromStringAndSize(stackbuf, nneeded);
1484 }
1485 else {
1486 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001487 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001488 assert(nneeded <= nallocated);
1489 _PyString_Resize(&v, nneeded);
1490 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001491 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001492
Tim Peters602f7402002-04-27 18:03:26 +00001493#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001494}
1495
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1497{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001498 if (!PyUnicode_Check(unicode)) {
1499 PyErr_BadArgument();
1500 return NULL;
1501 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001502 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1503 PyUnicode_GET_SIZE(unicode),
1504 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001505}
1506
1507/* --- UTF-16 Codec ------------------------------------------------------- */
1508
Tim Peters772747b2001-08-09 22:21:55 +00001509PyObject *
1510PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001511 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001512 const char *errors,
1513 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001514{
Walter Dörwald69652032004-09-07 20:24:22 +00001515 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1516}
1517
1518PyObject *
1519PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001520 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001521 const char *errors,
1522 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001523 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001524{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001525 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001526 Py_ssize_t startinpos;
1527 Py_ssize_t endinpos;
1528 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001529 PyUnicodeObject *unicode;
1530 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001531 const unsigned char *q, *e;
1532 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001533 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001534 /* Offsets from q for retrieving byte pairs in the right order. */
1535#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1536 int ihi = 1, ilo = 0;
1537#else
1538 int ihi = 0, ilo = 1;
1539#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001540 PyObject *errorHandler = NULL;
1541 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542
1543 /* Note: size will always be longer than the resulting Unicode
1544 character count */
1545 unicode = _PyUnicode_New(size);
1546 if (!unicode)
1547 return NULL;
1548 if (size == 0)
1549 return (PyObject *)unicode;
1550
1551 /* Unpack UTF-16 encoded data */
1552 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001553 q = (unsigned char *)s;
1554 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555
1556 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001557 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001558
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001559 /* Check for BOM marks (U+FEFF) in the input and adjust current
1560 byte order setting accordingly. In native mode, the leading BOM
1561 mark is skipped, in all other modes, it is copied to the output
1562 stream as-is (giving a ZWNBSP character). */
1563 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001564 if (size >= 2) {
1565 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001566#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001567 if (bom == 0xFEFF) {
1568 q += 2;
1569 bo = -1;
1570 }
1571 else if (bom == 0xFFFE) {
1572 q += 2;
1573 bo = 1;
1574 }
Tim Petersced69f82003-09-16 20:30:58 +00001575#else
Walter Dörwald69652032004-09-07 20:24:22 +00001576 if (bom == 0xFEFF) {
1577 q += 2;
1578 bo = 1;
1579 }
1580 else if (bom == 0xFFFE) {
1581 q += 2;
1582 bo = -1;
1583 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001584#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001585 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001587
Tim Peters772747b2001-08-09 22:21:55 +00001588 if (bo == -1) {
1589 /* force LE */
1590 ihi = 1;
1591 ilo = 0;
1592 }
1593 else if (bo == 1) {
1594 /* force BE */
1595 ihi = 0;
1596 ilo = 1;
1597 }
1598
1599 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001600 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001601 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001602 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001603 if (consumed)
1604 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001605 errmsg = "truncated data";
1606 startinpos = ((const char *)q)-starts;
1607 endinpos = ((const char *)e)-starts;
1608 goto utf16Error;
1609 /* The remaining input chars are ignored if the callback
1610 chooses to skip the input */
1611 }
1612 ch = (q[ihi] << 8) | q[ilo];
1613
Tim Peters772747b2001-08-09 22:21:55 +00001614 q += 2;
1615
Guido van Rossumd57fd912000-03-10 22:53:23 +00001616 if (ch < 0xD800 || ch > 0xDFFF) {
1617 *p++ = ch;
1618 continue;
1619 }
1620
1621 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001622 if (q >= e) {
1623 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001624 startinpos = (((const char *)q)-2)-starts;
1625 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001626 goto utf16Error;
1627 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001628 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001629 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1630 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001631 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001632#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001633 *p++ = ch;
1634 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001635#else
1636 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001637#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001638 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001639 }
1640 else {
1641 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001642 startinpos = (((const char *)q)-4)-starts;
1643 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001644 goto utf16Error;
1645 }
1646
Guido van Rossumd57fd912000-03-10 22:53:23 +00001647 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001648 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001649 startinpos = (((const char *)q)-2)-starts;
1650 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001651 /* Fall through to report the error */
1652
1653 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001654 outpos = p-PyUnicode_AS_UNICODE(unicode);
1655 if (unicode_decode_call_errorhandler(
1656 errors, &errorHandler,
1657 "utf16", errmsg,
1658 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1659 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001660 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001661 }
1662
1663 if (byteorder)
1664 *byteorder = bo;
1665
Walter Dörwald69652032004-09-07 20:24:22 +00001666 if (consumed)
1667 *consumed = (const char *)q-starts;
1668
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001670 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001671 goto onError;
1672
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001673 Py_XDECREF(errorHandler);
1674 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001675 return (PyObject *)unicode;
1676
1677onError:
1678 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001679 Py_XDECREF(errorHandler);
1680 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001681 return NULL;
1682}
1683
Tim Peters772747b2001-08-09 22:21:55 +00001684PyObject *
1685PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001686 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001687 const char *errors,
1688 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001689{
1690 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001691 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001692#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001693 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001694#else
1695 const int pairs = 0;
1696#endif
Tim Peters772747b2001-08-09 22:21:55 +00001697 /* Offsets from p for storing byte pairs in the right order. */
1698#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1699 int ihi = 1, ilo = 0;
1700#else
1701 int ihi = 0, ilo = 1;
1702#endif
1703
1704#define STORECHAR(CH) \
1705 do { \
1706 p[ihi] = ((CH) >> 8) & 0xff; \
1707 p[ilo] = (CH) & 0xff; \
1708 p += 2; \
1709 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001711#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001712 for (i = pairs = 0; i < size; i++)
1713 if (s[i] >= 0x10000)
1714 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001715#endif
Tim Petersced69f82003-09-16 20:30:58 +00001716 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001717 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718 if (v == NULL)
1719 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001720
Tim Peters772747b2001-08-09 22:21:55 +00001721 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001722 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001723 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001724 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001725 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001726
1727 if (byteorder == -1) {
1728 /* force LE */
1729 ihi = 1;
1730 ilo = 0;
1731 }
1732 else if (byteorder == 1) {
1733 /* force BE */
1734 ihi = 0;
1735 ilo = 1;
1736 }
1737
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001738 while (size-- > 0) {
1739 Py_UNICODE ch = *s++;
1740 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001741#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001742 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001743 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1744 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001746#endif
Tim Peters772747b2001-08-09 22:21:55 +00001747 STORECHAR(ch);
1748 if (ch2)
1749 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001750 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001752#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753}
1754
1755PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1756{
1757 if (!PyUnicode_Check(unicode)) {
1758 PyErr_BadArgument();
1759 return NULL;
1760 }
1761 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1762 PyUnicode_GET_SIZE(unicode),
1763 NULL,
1764 0);
1765}
1766
1767/* --- Unicode Escape Codec ----------------------------------------------- */
1768
Fredrik Lundh06d12682001-01-24 07:59:11 +00001769static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001770
Guido van Rossumd57fd912000-03-10 22:53:23 +00001771PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001772 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001773 const char *errors)
1774{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001775 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001776 Py_ssize_t startinpos;
1777 Py_ssize_t endinpos;
1778 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001779 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001781 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001783 char* message;
1784 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001785 PyObject *errorHandler = NULL;
1786 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001787
Guido van Rossumd57fd912000-03-10 22:53:23 +00001788 /* Escaped strings will always be longer than the resulting
1789 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001790 length after conversion to the true value.
1791 (but if the error callback returns a long replacement string
1792 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 v = _PyUnicode_New(size);
1794 if (v == NULL)
1795 goto onError;
1796 if (size == 0)
1797 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001798
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001799 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001800 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001801
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802 while (s < end) {
1803 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001804 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001805 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001806
1807 /* Non-escape characters are interpreted as Unicode ordinals */
1808 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001809 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810 continue;
1811 }
1812
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001813 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001814 /* \ - Escapes */
1815 s++;
Georg Brandl1dcb9c92007-11-02 22:46:38 +00001816 c = *s++;
1817 if (s > end)
1818 c = '\0'; /* Invalid after \ */
1819 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001820
1821 /* \x escapes */
1822 case '\n': break;
1823 case '\\': *p++ = '\\'; break;
1824 case '\'': *p++ = '\''; break;
1825 case '\"': *p++ = '\"'; break;
1826 case 'b': *p++ = '\b'; break;
1827 case 'f': *p++ = '\014'; break; /* FF */
1828 case 't': *p++ = '\t'; break;
1829 case 'n': *p++ = '\n'; break;
1830 case 'r': *p++ = '\r'; break;
1831 case 'v': *p++ = '\013'; break; /* VT */
1832 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1833
1834 /* \OOO (octal) escapes */
1835 case '0': case '1': case '2': case '3':
1836 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001837 x = s[-1] - '0';
Georg Brandl1dcb9c92007-11-02 22:46:38 +00001838 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001839 x = (x<<3) + *s++ - '0';
Georg Brandl1dcb9c92007-11-02 22:46:38 +00001840 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001841 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001842 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001843 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001844 break;
1845
Fredrik Lundhccc74732001-02-18 22:13:49 +00001846 /* hex escapes */
1847 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001848 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001849 digits = 2;
1850 message = "truncated \\xXX escape";
1851 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001852
Fredrik Lundhccc74732001-02-18 22:13:49 +00001853 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001855 digits = 4;
1856 message = "truncated \\uXXXX escape";
1857 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001858
Fredrik Lundhccc74732001-02-18 22:13:49 +00001859 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001860 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001861 digits = 8;
1862 message = "truncated \\UXXXXXXXX escape";
1863 hexescape:
1864 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001865 outpos = p-PyUnicode_AS_UNICODE(v);
1866 if (s+digits>end) {
1867 endinpos = size;
1868 if (unicode_decode_call_errorhandler(
1869 errors, &errorHandler,
1870 "unicodeescape", "end of string in escape sequence",
1871 starts, size, &startinpos, &endinpos, &exc, &s,
1872 (PyObject **)&v, &outpos, &p))
1873 goto onError;
1874 goto nextByte;
1875 }
1876 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001877 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001878 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001879 endinpos = (s+i+1)-starts;
1880 if (unicode_decode_call_errorhandler(
1881 errors, &errorHandler,
1882 "unicodeescape", message,
1883 starts, size, &startinpos, &endinpos, &exc, &s,
1884 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001885 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001886 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001887 }
1888 chr = (chr<<4) & ~0xF;
1889 if (c >= '0' && c <= '9')
1890 chr += c - '0';
1891 else if (c >= 'a' && c <= 'f')
1892 chr += 10 + c - 'a';
1893 else
1894 chr += 10 + c - 'A';
1895 }
1896 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001897 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001898 /* _decoding_error will have already written into the
1899 target buffer. */
1900 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001901 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001902 /* when we get here, chr is a 32-bit unicode character */
1903 if (chr <= 0xffff)
1904 /* UCS-2 character */
1905 *p++ = (Py_UNICODE) chr;
1906 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001907 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001908 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001909#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001910 *p++ = chr;
1911#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001912 chr -= 0x10000L;
1913 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001914 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001915#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001916 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001917 endinpos = s-starts;
1918 outpos = p-PyUnicode_AS_UNICODE(v);
1919 if (unicode_decode_call_errorhandler(
1920 errors, &errorHandler,
1921 "unicodeescape", "illegal Unicode character",
1922 starts, size, &startinpos, &endinpos, &exc, &s,
1923 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001924 goto onError;
1925 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001926 break;
1927
1928 /* \N{name} */
1929 case 'N':
1930 message = "malformed \\N character escape";
1931 if (ucnhash_CAPI == NULL) {
1932 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001933 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001934 m = PyImport_ImportModule("unicodedata");
1935 if (m == NULL)
1936 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001937 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001938 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001939 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001940 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00001941 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001942 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001943 if (ucnhash_CAPI == NULL)
1944 goto ucnhashError;
1945 }
1946 if (*s == '{') {
1947 const char *start = s+1;
1948 /* look for the closing brace */
1949 while (*s != '}' && s < end)
1950 s++;
1951 if (s > start && s < end && *s == '}') {
1952 /* found a name. look it up in the unicode database */
1953 message = "unknown Unicode character name";
1954 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001955 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001956 goto store;
1957 }
1958 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001959 endinpos = s-starts;
1960 outpos = p-PyUnicode_AS_UNICODE(v);
1961 if (unicode_decode_call_errorhandler(
1962 errors, &errorHandler,
1963 "unicodeescape", message,
1964 starts, size, &startinpos, &endinpos, &exc, &s,
1965 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001966 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001967 break;
1968
1969 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001970 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001971 message = "\\ at end of string";
1972 s--;
1973 endinpos = s-starts;
1974 outpos = p-PyUnicode_AS_UNICODE(v);
1975 if (unicode_decode_call_errorhandler(
1976 errors, &errorHandler,
1977 "unicodeescape", message,
1978 starts, size, &startinpos, &endinpos, &exc, &s,
1979 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001980 goto onError;
1981 }
1982 else {
1983 *p++ = '\\';
1984 *p++ = (unsigned char)s[-1];
1985 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001986 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001988 nextByte:
1989 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001990 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00001991 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001992 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001993 Py_XDECREF(errorHandler);
1994 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001995 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001996
Fredrik Lundhccc74732001-02-18 22:13:49 +00001997ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001998 PyErr_SetString(
1999 PyExc_UnicodeError,
2000 "\\N escapes not supported (can't load unicodedata module)"
2001 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002002 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002003 Py_XDECREF(errorHandler);
2004 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002005 return NULL;
2006
Fredrik Lundhccc74732001-02-18 22:13:49 +00002007onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002009 Py_XDECREF(errorHandler);
2010 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002011 return NULL;
2012}
2013
2014/* Return a Unicode-Escape string version of the Unicode object.
2015
2016 If quotes is true, the string is enclosed in u"" or u'' quotes as
2017 appropriate.
2018
2019*/
2020
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002021Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Fredrik Lundh95e2a912006-05-26 11:38:15 +00002022 Py_ssize_t size,
2023 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002024{
2025 /* like wcschr, but doesn't stop at NULL characters */
2026
2027 while (size-- > 0) {
2028 if (*s == ch)
2029 return s;
2030 s++;
2031 }
2032
2033 return NULL;
2034}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002035
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036static
2037PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002038 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002039 int quotes)
2040{
2041 PyObject *repr;
2042 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002043
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002044 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002045
Neal Norwitz19c35bb2006-08-21 22:13:11 +00002046 /* Initial allocation is based on the longest-possible unichr
2047 escape.
2048
2049 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2050 unichr, so in this case it's the longest unichr escape. In
2051 narrow (UTF-16) builds this is five chars per source unichr
2052 since there are two unichrs in the surrogate pair, so in narrow
2053 (UTF-16) builds it's not the longest unichr escape.
2054
2055 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2056 so in the narrow (UTF-16) build case it's the longest unichr
2057 escape.
2058 */
2059
2060 repr = PyString_FromStringAndSize(NULL,
2061 2
2062#ifdef Py_UNICODE_WIDE
2063 + 10*size
2064#else
2065 + 6*size
2066#endif
2067 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002068 if (repr == NULL)
2069 return NULL;
2070
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002071 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002072
2073 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002074 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002075 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002076 !findchar(s, size, '"')) ? '"' : '\'';
2077 }
2078 while (size-- > 0) {
2079 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002080
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002081 /* Escape quotes and backslashes */
2082 if ((quotes &&
2083 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002084 *p++ = '\\';
2085 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002086 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002087 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002088
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002089#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002090 /* Map 21-bit characters to '\U00xxxxxx' */
2091 else if (ch >= 0x10000) {
2092 *p++ = '\\';
2093 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002094 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2095 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2096 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2097 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2098 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2099 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2100 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002101 *p++ = hexdigit[ch & 0x0000000F];
2102 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002103 }
Neal Norwitz19c35bb2006-08-21 22:13:11 +00002104#else
2105 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002106 else if (ch >= 0xD800 && ch < 0xDC00) {
2107 Py_UNICODE ch2;
2108 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002109
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002110 ch2 = *s++;
2111 size--;
2112 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2113 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2114 *p++ = '\\';
2115 *p++ = 'U';
2116 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2117 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2118 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2119 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2120 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2121 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2122 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2123 *p++ = hexdigit[ucs & 0x0000000F];
2124 continue;
2125 }
2126 /* Fall through: isolated surrogates are copied as-is */
2127 s--;
2128 size++;
2129 }
Neal Norwitz19c35bb2006-08-21 22:13:11 +00002130#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002131
Guido van Rossumd57fd912000-03-10 22:53:23 +00002132 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002133 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002134 *p++ = '\\';
2135 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002136 *p++ = hexdigit[(ch >> 12) & 0x000F];
2137 *p++ = hexdigit[(ch >> 8) & 0x000F];
2138 *p++ = hexdigit[(ch >> 4) & 0x000F];
2139 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002140 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002141
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002142 /* Map special whitespace to '\t', \n', '\r' */
2143 else if (ch == '\t') {
2144 *p++ = '\\';
2145 *p++ = 't';
2146 }
2147 else if (ch == '\n') {
2148 *p++ = '\\';
2149 *p++ = 'n';
2150 }
2151 else if (ch == '\r') {
2152 *p++ = '\\';
2153 *p++ = 'r';
2154 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002155
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002156 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002157 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002158 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002159 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002160 *p++ = hexdigit[(ch >> 4) & 0x000F];
2161 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002162 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002163
Guido van Rossumd57fd912000-03-10 22:53:23 +00002164 /* Copy everything else as-is */
2165 else
2166 *p++ = (char) ch;
2167 }
2168 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002169 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002170
2171 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002172 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002173 return repr;
2174}
2175
2176PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002177 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002178{
2179 return unicodeescape_string(s, size, 0);
2180}
2181
2182PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2183{
2184 if (!PyUnicode_Check(unicode)) {
2185 PyErr_BadArgument();
2186 return NULL;
2187 }
2188 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2189 PyUnicode_GET_SIZE(unicode));
2190}
2191
2192/* --- Raw Unicode Escape Codec ------------------------------------------- */
2193
2194PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002195 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002196 const char *errors)
2197{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002198 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002199 Py_ssize_t startinpos;
2200 Py_ssize_t endinpos;
2201 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002202 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002203 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002204 const char *end;
2205 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002206 PyObject *errorHandler = NULL;
2207 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002208
Guido van Rossumd57fd912000-03-10 22:53:23 +00002209 /* Escaped strings will always be longer than the resulting
2210 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002211 length after conversion to the true value. (But decoding error
2212 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002213 v = _PyUnicode_New(size);
2214 if (v == NULL)
2215 goto onError;
2216 if (size == 0)
2217 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002218 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002219 end = s + size;
2220 while (s < end) {
2221 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002222 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002223 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002224 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002225
2226 /* Non-escape characters are interpreted as Unicode ordinals */
2227 if (*s != '\\') {
2228 *p++ = (unsigned char)*s++;
2229 continue;
2230 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002231 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232
2233 /* \u-escapes are only interpreted iff the number of leading
2234 backslashes if odd */
2235 bs = s;
2236 for (;s < end;) {
2237 if (*s != '\\')
2238 break;
2239 *p++ = (unsigned char)*s++;
2240 }
2241 if (((s - bs) & 1) == 0 ||
2242 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002243 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002244 continue;
2245 }
2246 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002247 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248 s++;
2249
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002250 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002251 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002252 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002253 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002254 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002255 endinpos = s-starts;
2256 if (unicode_decode_call_errorhandler(
2257 errors, &errorHandler,
2258 "rawunicodeescape", "truncated \\uXXXX",
2259 starts, size, &startinpos, &endinpos, &exc, &s,
2260 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002262 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002263 }
2264 x = (x<<4) & ~0xF;
2265 if (c >= '0' && c <= '9')
2266 x += c - '0';
2267 else if (c >= 'a' && c <= 'f')
2268 x += 10 + c - 'a';
2269 else
2270 x += 10 + c - 'A';
2271 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002272#ifndef Py_UNICODE_WIDE
2273 if (x > 0x10000) {
2274 if (unicode_decode_call_errorhandler(
2275 errors, &errorHandler,
2276 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2277 starts, size, &startinpos, &endinpos, &exc, &s,
2278 (PyObject **)&v, &outpos, &p))
2279 goto onError;
2280 }
2281#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002282 *p++ = x;
2283 nextByte:
2284 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002285 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002286 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002287 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002288 Py_XDECREF(errorHandler);
2289 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002290 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002291
Guido van Rossumd57fd912000-03-10 22:53:23 +00002292 onError:
2293 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002294 Py_XDECREF(errorHandler);
2295 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002296 return NULL;
2297}
2298
2299PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002300 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002301{
2302 PyObject *repr;
2303 char *p;
2304 char *q;
2305
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002306 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002307
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002308#ifdef Py_UNICODE_WIDE
2309 repr = PyString_FromStringAndSize(NULL, 10 * size);
2310#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002311 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002312#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002313 if (repr == NULL)
2314 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002315 if (size == 0)
2316 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002317
2318 p = q = PyString_AS_STRING(repr);
2319 while (size-- > 0) {
2320 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002321#ifdef Py_UNICODE_WIDE
2322 /* Map 32-bit characters to '\Uxxxxxxxx' */
2323 if (ch >= 0x10000) {
2324 *p++ = '\\';
2325 *p++ = 'U';
2326 *p++ = hexdigit[(ch >> 28) & 0xf];
2327 *p++ = hexdigit[(ch >> 24) & 0xf];
2328 *p++ = hexdigit[(ch >> 20) & 0xf];
2329 *p++ = hexdigit[(ch >> 16) & 0xf];
2330 *p++ = hexdigit[(ch >> 12) & 0xf];
2331 *p++ = hexdigit[(ch >> 8) & 0xf];
2332 *p++ = hexdigit[(ch >> 4) & 0xf];
2333 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002334 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002335 else
2336#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002337 /* Map 16-bit characters to '\uxxxx' */
2338 if (ch >= 256) {
2339 *p++ = '\\';
2340 *p++ = 'u';
2341 *p++ = hexdigit[(ch >> 12) & 0xf];
2342 *p++ = hexdigit[(ch >> 8) & 0xf];
2343 *p++ = hexdigit[(ch >> 4) & 0xf];
2344 *p++ = hexdigit[ch & 15];
2345 }
2346 /* Copy everything else as-is */
2347 else
2348 *p++ = (char) ch;
2349 }
2350 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002351 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002352 return repr;
2353}
2354
2355PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2356{
2357 if (!PyUnicode_Check(unicode)) {
2358 PyErr_BadArgument();
2359 return NULL;
2360 }
2361 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2362 PyUnicode_GET_SIZE(unicode));
2363}
2364
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002365/* --- Unicode Internal Codec ------------------------------------------- */
2366
2367PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002368 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002369 const char *errors)
2370{
2371 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002372 Py_ssize_t startinpos;
2373 Py_ssize_t endinpos;
2374 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002375 PyUnicodeObject *v;
2376 Py_UNICODE *p;
2377 const char *end;
2378 const char *reason;
2379 PyObject *errorHandler = NULL;
2380 PyObject *exc = NULL;
2381
Neal Norwitzd43069c2006-01-08 01:12:10 +00002382#ifdef Py_UNICODE_WIDE
2383 Py_UNICODE unimax = PyUnicode_GetMax();
2384#endif
2385
Armin Rigo4b63c212006-10-04 11:44:06 +00002386 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002387 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2388 if (v == NULL)
2389 goto onError;
2390 if (PyUnicode_GetSize((PyObject *)v) == 0)
2391 return (PyObject *)v;
2392 p = PyUnicode_AS_UNICODE(v);
2393 end = s + size;
2394
2395 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00002396 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002397 /* We have to sanity check the raw data, otherwise doom looms for
2398 some malformed UCS-4 data. */
2399 if (
2400 #ifdef Py_UNICODE_WIDE
2401 *p > unimax || *p < 0 ||
2402 #endif
2403 end-s < Py_UNICODE_SIZE
2404 )
2405 {
2406 startinpos = s - starts;
2407 if (end-s < Py_UNICODE_SIZE) {
2408 endinpos = end-starts;
2409 reason = "truncated input";
2410 }
2411 else {
2412 endinpos = s - starts + Py_UNICODE_SIZE;
2413 reason = "illegal code point (> 0x10FFFF)";
2414 }
2415 outpos = p - PyUnicode_AS_UNICODE(v);
2416 if (unicode_decode_call_errorhandler(
2417 errors, &errorHandler,
2418 "unicode_internal", reason,
2419 starts, size, &startinpos, &endinpos, &exc, &s,
2420 (PyObject **)&v, &outpos, &p)) {
2421 goto onError;
2422 }
2423 }
2424 else {
2425 p++;
2426 s += Py_UNICODE_SIZE;
2427 }
2428 }
2429
Martin v. Löwis412fb672006-04-13 06:34:32 +00002430 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002431 goto onError;
2432 Py_XDECREF(errorHandler);
2433 Py_XDECREF(exc);
2434 return (PyObject *)v;
2435
2436 onError:
2437 Py_XDECREF(v);
2438 Py_XDECREF(errorHandler);
2439 Py_XDECREF(exc);
2440 return NULL;
2441}
2442
Guido van Rossumd57fd912000-03-10 22:53:23 +00002443/* --- Latin-1 Codec ------------------------------------------------------ */
2444
2445PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002446 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002447 const char *errors)
2448{
2449 PyUnicodeObject *v;
2450 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002451
Guido van Rossumd57fd912000-03-10 22:53:23 +00002452 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002453 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002454 Py_UNICODE r = *(unsigned char*)s;
2455 return PyUnicode_FromUnicode(&r, 1);
2456 }
2457
Guido van Rossumd57fd912000-03-10 22:53:23 +00002458 v = _PyUnicode_New(size);
2459 if (v == NULL)
2460 goto onError;
2461 if (size == 0)
2462 return (PyObject *)v;
2463 p = PyUnicode_AS_UNICODE(v);
2464 while (size-- > 0)
2465 *p++ = (unsigned char)*s++;
2466 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002467
Guido van Rossumd57fd912000-03-10 22:53:23 +00002468 onError:
2469 Py_XDECREF(v);
2470 return NULL;
2471}
2472
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002473/* create or adjust a UnicodeEncodeError */
2474static void make_encode_exception(PyObject **exceptionObject,
2475 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002476 const Py_UNICODE *unicode, Py_ssize_t size,
2477 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002478 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002479{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002480 if (*exceptionObject == NULL) {
2481 *exceptionObject = PyUnicodeEncodeError_Create(
2482 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002483 }
2484 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002485 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2486 goto onError;
2487 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2488 goto onError;
2489 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2490 goto onError;
2491 return;
2492 onError:
2493 Py_DECREF(*exceptionObject);
2494 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002495 }
2496}
2497
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002498/* raises a UnicodeEncodeError */
2499static void raise_encode_exception(PyObject **exceptionObject,
2500 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002501 const Py_UNICODE *unicode, Py_ssize_t size,
2502 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002503 const char *reason)
2504{
2505 make_encode_exception(exceptionObject,
2506 encoding, unicode, size, startpos, endpos, reason);
2507 if (*exceptionObject != NULL)
2508 PyCodec_StrictErrors(*exceptionObject);
2509}
2510
2511/* error handling callback helper:
2512 build arguments, call the callback and check the arguments,
2513 put the result into newpos and return the replacement string, which
2514 has to be freed by the caller */
2515static PyObject *unicode_encode_call_errorhandler(const char *errors,
2516 PyObject **errorHandler,
2517 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002518 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2519 Py_ssize_t startpos, Py_ssize_t endpos,
2520 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002521{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002522 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002523
2524 PyObject *restuple;
2525 PyObject *resunicode;
2526
2527 if (*errorHandler == NULL) {
2528 *errorHandler = PyCodec_LookupError(errors);
2529 if (*errorHandler == NULL)
2530 return NULL;
2531 }
2532
2533 make_encode_exception(exceptionObject,
2534 encoding, unicode, size, startpos, endpos, reason);
2535 if (*exceptionObject == NULL)
2536 return NULL;
2537
2538 restuple = PyObject_CallFunctionObjArgs(
2539 *errorHandler, *exceptionObject, NULL);
2540 if (restuple == NULL)
2541 return NULL;
2542 if (!PyTuple_Check(restuple)) {
2543 PyErr_Format(PyExc_TypeError, &argparse[4]);
2544 Py_DECREF(restuple);
2545 return NULL;
2546 }
2547 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2548 &resunicode, newpos)) {
2549 Py_DECREF(restuple);
2550 return NULL;
2551 }
2552 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002553 *newpos = size+*newpos;
2554 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002555 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002556 Py_DECREF(restuple);
2557 return NULL;
2558 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002559 Py_INCREF(resunicode);
2560 Py_DECREF(restuple);
2561 return resunicode;
2562}
2563
2564static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002565 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002566 const char *errors,
2567 int limit)
2568{
2569 /* output object */
2570 PyObject *res;
2571 /* pointers to the beginning and end+1 of input */
2572 const Py_UNICODE *startp = p;
2573 const Py_UNICODE *endp = p + size;
2574 /* pointer to the beginning of the unencodable characters */
2575 /* const Py_UNICODE *badp = NULL; */
2576 /* pointer into the output */
2577 char *str;
2578 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002579 Py_ssize_t respos = 0;
2580 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00002581 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2582 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002583 PyObject *errorHandler = NULL;
2584 PyObject *exc = NULL;
2585 /* the following variable is used for caching string comparisons
2586 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2587 int known_errorHandler = -1;
2588
2589 /* allocate enough for a simple encoding without
2590 replacements, if we need more, we'll resize */
2591 res = PyString_FromStringAndSize(NULL, size);
2592 if (res == NULL)
2593 goto onError;
2594 if (size == 0)
2595 return res;
2596 str = PyString_AS_STRING(res);
2597 ressize = size;
2598
2599 while (p<endp) {
2600 Py_UNICODE c = *p;
2601
2602 /* can we encode this? */
2603 if (c<limit) {
2604 /* no overflow check, because we know that the space is enough */
2605 *str++ = (char)c;
2606 ++p;
2607 }
2608 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002609 Py_ssize_t unicodepos = p-startp;
2610 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002611 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002612 Py_ssize_t repsize;
2613 Py_ssize_t newpos;
2614 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002615 Py_UNICODE *uni2;
2616 /* startpos for collecting unencodable chars */
2617 const Py_UNICODE *collstart = p;
2618 const Py_UNICODE *collend = p;
2619 /* find all unecodable characters */
2620 while ((collend < endp) && ((*collend)>=limit))
2621 ++collend;
2622 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2623 if (known_errorHandler==-1) {
2624 if ((errors==NULL) || (!strcmp(errors, "strict")))
2625 known_errorHandler = 1;
2626 else if (!strcmp(errors, "replace"))
2627 known_errorHandler = 2;
2628 else if (!strcmp(errors, "ignore"))
2629 known_errorHandler = 3;
2630 else if (!strcmp(errors, "xmlcharrefreplace"))
2631 known_errorHandler = 4;
2632 else
2633 known_errorHandler = 0;
2634 }
2635 switch (known_errorHandler) {
2636 case 1: /* strict */
2637 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2638 goto onError;
2639 case 2: /* replace */
2640 while (collstart++<collend)
2641 *str++ = '?'; /* fall through */
2642 case 3: /* ignore */
2643 p = collend;
2644 break;
2645 case 4: /* xmlcharrefreplace */
2646 respos = str-PyString_AS_STRING(res);
2647 /* determine replacement size (temporarily (mis)uses p) */
2648 for (p = collstart, repsize = 0; p < collend; ++p) {
2649 if (*p<10)
2650 repsize += 2+1+1;
2651 else if (*p<100)
2652 repsize += 2+2+1;
2653 else if (*p<1000)
2654 repsize += 2+3+1;
2655 else if (*p<10000)
2656 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002657#ifndef Py_UNICODE_WIDE
2658 else
2659 repsize += 2+5+1;
2660#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002661 else if (*p<100000)
2662 repsize += 2+5+1;
2663 else if (*p<1000000)
2664 repsize += 2+6+1;
2665 else
2666 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002667#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002668 }
2669 requiredsize = respos+repsize+(endp-collend);
2670 if (requiredsize > ressize) {
2671 if (requiredsize<2*ressize)
2672 requiredsize = 2*ressize;
2673 if (_PyString_Resize(&res, requiredsize))
2674 goto onError;
2675 str = PyString_AS_STRING(res) + respos;
2676 ressize = requiredsize;
2677 }
2678 /* generate replacement (temporarily (mis)uses p) */
2679 for (p = collstart; p < collend; ++p) {
2680 str += sprintf(str, "&#%d;", (int)*p);
2681 }
2682 p = collend;
2683 break;
2684 default:
2685 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2686 encoding, reason, startp, size, &exc,
2687 collstart-startp, collend-startp, &newpos);
2688 if (repunicode == NULL)
2689 goto onError;
2690 /* need more space? (at least enough for what we
2691 have+the replacement+the rest of the string, so
2692 we won't have to check space for encodable characters) */
2693 respos = str-PyString_AS_STRING(res);
2694 repsize = PyUnicode_GET_SIZE(repunicode);
2695 requiredsize = respos+repsize+(endp-collend);
2696 if (requiredsize > ressize) {
2697 if (requiredsize<2*ressize)
2698 requiredsize = 2*ressize;
2699 if (_PyString_Resize(&res, requiredsize)) {
2700 Py_DECREF(repunicode);
2701 goto onError;
2702 }
2703 str = PyString_AS_STRING(res) + respos;
2704 ressize = requiredsize;
2705 }
2706 /* check if there is anything unencodable in the replacement
2707 and copy it to the output */
2708 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2709 c = *uni2;
2710 if (c >= limit) {
2711 raise_encode_exception(&exc, encoding, startp, size,
2712 unicodepos, unicodepos+1, reason);
2713 Py_DECREF(repunicode);
2714 goto onError;
2715 }
2716 *str = (char)c;
2717 }
2718 p = startp + newpos;
2719 Py_DECREF(repunicode);
2720 }
2721 }
2722 }
2723 /* Resize if we allocated to much */
2724 respos = str-PyString_AS_STRING(res);
2725 if (respos<ressize)
2726 /* If this falls res will be NULL */
2727 _PyString_Resize(&res, respos);
2728 Py_XDECREF(errorHandler);
2729 Py_XDECREF(exc);
2730 return res;
2731
2732 onError:
2733 Py_XDECREF(res);
2734 Py_XDECREF(errorHandler);
2735 Py_XDECREF(exc);
2736 return NULL;
2737}
2738
Guido van Rossumd57fd912000-03-10 22:53:23 +00002739PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002740 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741 const char *errors)
2742{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002743 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002744}
2745
2746PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2747{
2748 if (!PyUnicode_Check(unicode)) {
2749 PyErr_BadArgument();
2750 return NULL;
2751 }
2752 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2753 PyUnicode_GET_SIZE(unicode),
2754 NULL);
2755}
2756
2757/* --- 7-bit ASCII Codec -------------------------------------------------- */
2758
Guido van Rossumd57fd912000-03-10 22:53:23 +00002759PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002760 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761 const char *errors)
2762{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002763 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764 PyUnicodeObject *v;
2765 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002766 Py_ssize_t startinpos;
2767 Py_ssize_t endinpos;
2768 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002769 const char *e;
2770 PyObject *errorHandler = NULL;
2771 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002772
Guido van Rossumd57fd912000-03-10 22:53:23 +00002773 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002774 if (size == 1 && *(unsigned char*)s < 128) {
2775 Py_UNICODE r = *(unsigned char*)s;
2776 return PyUnicode_FromUnicode(&r, 1);
2777 }
Tim Petersced69f82003-09-16 20:30:58 +00002778
Guido van Rossumd57fd912000-03-10 22:53:23 +00002779 v = _PyUnicode_New(size);
2780 if (v == NULL)
2781 goto onError;
2782 if (size == 0)
2783 return (PyObject *)v;
2784 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002785 e = s + size;
2786 while (s < e) {
2787 register unsigned char c = (unsigned char)*s;
2788 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002789 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002790 ++s;
2791 }
2792 else {
2793 startinpos = s-starts;
2794 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002795 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002796 if (unicode_decode_call_errorhandler(
2797 errors, &errorHandler,
2798 "ascii", "ordinal not in range(128)",
2799 starts, size, &startinpos, &endinpos, &exc, &s,
2800 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002802 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002804 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00002805 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002806 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002807 Py_XDECREF(errorHandler);
2808 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002809 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002810
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811 onError:
2812 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002813 Py_XDECREF(errorHandler);
2814 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002815 return NULL;
2816}
2817
Guido van Rossumd57fd912000-03-10 22:53:23 +00002818PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002819 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002820 const char *errors)
2821{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002822 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002823}
2824
2825PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2826{
2827 if (!PyUnicode_Check(unicode)) {
2828 PyErr_BadArgument();
2829 return NULL;
2830 }
2831 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2832 PyUnicode_GET_SIZE(unicode),
2833 NULL);
2834}
2835
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002836#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002837
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002838/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002839
Martin v. Löwisd8251432006-06-14 05:21:04 +00002840#if SIZEOF_INT < SIZEOF_SSIZE_T
2841#define NEED_RETRY
2842#endif
2843
2844/* XXX This code is limited to "true" double-byte encodings, as
2845 a) it assumes an incomplete character consists of a single byte, and
2846 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2847 encodings, see IsDBCSLeadByteEx documentation. */
2848
/* Return 1 when the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  It is then accepted when
   CharPrev() cannot step back from it, when the previous character does
   not start with a lead byte, or when the previous character is exactly
   two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
2859
2860/*
2861 * Decode MBCS string into unicode object. If 'final' is set, converts
2862 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
2863 */
2864static int decode_mbcs(PyUnicodeObject **v,
2865 const char *s, /* MBCS string */
2866 int size, /* sizeof MBCS string */
2867 int final)
2868{
2869 Py_UNICODE *p;
2870 Py_ssize_t n = 0;
2871 int usize = 0;
2872
2873 assert(size >= 0);
2874
2875 /* Skip trailing lead-byte unless 'final' is set */
2876 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
2877 --size;
2878
2879 /* First get the size of the result */
2880 if (size > 0) {
2881 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2882 if (usize == 0) {
2883 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2884 return -1;
2885 }
2886 }
2887
2888 if (*v == NULL) {
2889 /* Create unicode object */
2890 *v = _PyUnicode_New(usize);
2891 if (*v == NULL)
2892 return -1;
2893 }
2894 else {
2895 /* Extend unicode object */
2896 n = PyUnicode_GET_SIZE(*v);
2897 if (_PyUnicode_Resize(v, n + usize) < 0)
2898 return -1;
2899 }
2900
2901 /* Do the conversion */
2902 if (size > 0) {
2903 p = PyUnicode_AS_UNICODE(*v) + n;
2904 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2905 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2906 return -1;
2907 }
2908 }
2909
2910 return size;
2911}
2912
2913PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
2914 Py_ssize_t size,
2915 const char *errors,
2916 Py_ssize_t *consumed)
2917{
2918 PyUnicodeObject *v = NULL;
2919 int done;
2920
2921 if (consumed)
2922 *consumed = 0;
2923
2924#ifdef NEED_RETRY
2925 retry:
2926 if (size > INT_MAX)
2927 done = decode_mbcs(&v, s, INT_MAX, 0);
2928 else
2929#endif
2930 done = decode_mbcs(&v, s, (int)size, !consumed);
2931
2932 if (done < 0) {
2933 Py_XDECREF(v);
2934 return NULL;
2935 }
2936
2937 if (consumed)
2938 *consumed += done;
2939
2940#ifdef NEED_RETRY
2941 if (size > INT_MAX) {
2942 s += done;
2943 size -= done;
2944 goto retry;
2945 }
2946#endif
2947
2948 return (PyObject *)v;
2949}
2950
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002951PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002952 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002953 const char *errors)
2954{
Martin v. Löwisd8251432006-06-14 05:21:04 +00002955 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
2956}
2957
2958/*
2959 * Convert unicode into string object (MBCS).
2960 * Returns 0 if succeed, -1 otherwise.
2961 */
2962static int encode_mbcs(PyObject **repr,
2963 const Py_UNICODE *p, /* unicode */
2964 int size) /* size of unicode */
2965{
2966 int mbcssize = 0;
2967 Py_ssize_t n = 0;
2968
2969 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002970
2971 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00002972 if (size > 0) {
2973 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2974 if (mbcssize == 0) {
2975 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2976 return -1;
2977 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002978 }
2979
Martin v. Löwisd8251432006-06-14 05:21:04 +00002980 if (*repr == NULL) {
2981 /* Create string object */
2982 *repr = PyString_FromStringAndSize(NULL, mbcssize);
2983 if (*repr == NULL)
2984 return -1;
2985 }
2986 else {
2987 /* Extend string object */
2988 n = PyString_Size(*repr);
2989 if (_PyString_Resize(repr, n + mbcssize) < 0)
2990 return -1;
2991 }
2992
2993 /* Do the conversion */
2994 if (size > 0) {
2995 char *s = PyString_AS_STRING(*repr) + n;
2996 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2997 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2998 return -1;
2999 }
3000 }
3001
3002 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003003}
3004
3005PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003006 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003007 const char *errors)
3008{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003009 PyObject *repr = NULL;
3010 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003011
Martin v. Löwisd8251432006-06-14 05:21:04 +00003012#ifdef NEED_RETRY
3013 retry:
3014 if (size > INT_MAX)
3015 ret = encode_mbcs(&repr, p, INT_MAX);
3016 else
3017#endif
3018 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003019
Martin v. Löwisd8251432006-06-14 05:21:04 +00003020 if (ret < 0) {
3021 Py_XDECREF(repr);
3022 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003023 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003024
3025#ifdef NEED_RETRY
3026 if (size > INT_MAX) {
3027 p += INT_MAX;
3028 size -= INT_MAX;
3029 goto retry;
3030 }
3031#endif
3032
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003033 return repr;
3034}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003035
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003036PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3037{
3038 if (!PyUnicode_Check(unicode)) {
3039 PyErr_BadArgument();
3040 return NULL;
3041 }
3042 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3043 PyUnicode_GET_SIZE(unicode),
3044 NULL);
3045}
3046
Martin v. Löwisd8251432006-06-14 05:21:04 +00003047#undef NEED_RETRY
3048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003049#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003050
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051/* --- Character Mapping Codec -------------------------------------------- */
3052
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003054 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003055 PyObject *mapping,
3056 const char *errors)
3057{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003058 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003059 Py_ssize_t startinpos;
3060 Py_ssize_t endinpos;
3061 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003062 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003063 PyUnicodeObject *v;
3064 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003065 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003066 PyObject *errorHandler = NULL;
3067 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003068 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003069 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003070
Guido van Rossumd57fd912000-03-10 22:53:23 +00003071 /* Default to Latin-1 */
3072 if (mapping == NULL)
3073 return PyUnicode_DecodeLatin1(s, size, errors);
3074
3075 v = _PyUnicode_New(size);
3076 if (v == NULL)
3077 goto onError;
3078 if (size == 0)
3079 return (PyObject *)v;
3080 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003081 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003082 if (PyUnicode_CheckExact(mapping)) {
3083 mapstring = PyUnicode_AS_UNICODE(mapping);
3084 maplen = PyUnicode_GET_SIZE(mapping);
3085 while (s < e) {
3086 unsigned char ch = *s;
3087 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003088
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003089 if (ch < maplen)
3090 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003092 if (x == 0xfffe) {
3093 /* undefined mapping */
3094 outpos = p-PyUnicode_AS_UNICODE(v);
3095 startinpos = s-starts;
3096 endinpos = startinpos+1;
3097 if (unicode_decode_call_errorhandler(
3098 errors, &errorHandler,
3099 "charmap", "character maps to <undefined>",
3100 starts, size, &startinpos, &endinpos, &exc, &s,
3101 (PyObject **)&v, &outpos, &p)) {
3102 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003103 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003104 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003105 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003106 *p++ = x;
3107 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003108 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003109 }
3110 else {
3111 while (s < e) {
3112 unsigned char ch = *s;
3113 PyObject *w, *x;
3114
3115 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3116 w = PyInt_FromLong((long)ch);
3117 if (w == NULL)
3118 goto onError;
3119 x = PyObject_GetItem(mapping, w);
3120 Py_DECREF(w);
3121 if (x == NULL) {
3122 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3123 /* No mapping found means: mapping is undefined. */
3124 PyErr_Clear();
3125 x = Py_None;
3126 Py_INCREF(x);
3127 } else
3128 goto onError;
3129 }
3130
3131 /* Apply mapping */
3132 if (PyInt_Check(x)) {
3133 long value = PyInt_AS_LONG(x);
3134 if (value < 0 || value > 65535) {
3135 PyErr_SetString(PyExc_TypeError,
3136 "character mapping must be in range(65536)");
3137 Py_DECREF(x);
3138 goto onError;
3139 }
3140 *p++ = (Py_UNICODE)value;
3141 }
3142 else if (x == Py_None) {
3143 /* undefined mapping */
3144 outpos = p-PyUnicode_AS_UNICODE(v);
3145 startinpos = s-starts;
3146 endinpos = startinpos+1;
3147 if (unicode_decode_call_errorhandler(
3148 errors, &errorHandler,
3149 "charmap", "character maps to <undefined>",
3150 starts, size, &startinpos, &endinpos, &exc, &s,
3151 (PyObject **)&v, &outpos, &p)) {
3152 Py_DECREF(x);
3153 goto onError;
3154 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003155 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003156 continue;
3157 }
3158 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003159 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003160
3161 if (targetsize == 1)
3162 /* 1-1 mapping */
3163 *p++ = *PyUnicode_AS_UNICODE(x);
3164
3165 else if (targetsize > 1) {
3166 /* 1-n mapping */
3167 if (targetsize > extrachars) {
3168 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003169 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3170 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003171 (targetsize << 2);
3172 extrachars += needed;
Armin Rigo4b63c212006-10-04 11:44:06 +00003173 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003174 if (_PyUnicode_Resize(&v,
3175 PyUnicode_GET_SIZE(v) + needed) < 0) {
3176 Py_DECREF(x);
3177 goto onError;
3178 }
3179 p = PyUnicode_AS_UNICODE(v) + oldpos;
3180 }
3181 Py_UNICODE_COPY(p,
3182 PyUnicode_AS_UNICODE(x),
3183 targetsize);
3184 p += targetsize;
3185 extrachars -= targetsize;
3186 }
3187 /* 1-0 mapping: skip the character */
3188 }
3189 else {
3190 /* wrong return value */
3191 PyErr_SetString(PyExc_TypeError,
3192 "character mapping must return integer, None or unicode");
3193 Py_DECREF(x);
3194 goto onError;
3195 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003196 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003197 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003198 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003199 }
3200 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003201 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003202 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003203 Py_XDECREF(errorHandler);
3204 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003205 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003206
Guido van Rossumd57fd912000-03-10 22:53:23 +00003207 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003208 Py_XDECREF(errorHandler);
3209 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003210 Py_XDECREF(v);
3211 return NULL;
3212}
3213
Martin v. Löwis3f767792006-06-04 19:36:28 +00003214/* Charmap encoding: the lookup table */
3215
3216struct encoding_map{
3217 PyObject_HEAD
3218 unsigned char level1[32];
3219 int count2, count3;
3220 unsigned char level23[1];
3221};
3222
3223static PyObject*
3224encoding_map_size(PyObject *obj, PyObject* args)
3225{
3226 struct encoding_map *map = (struct encoding_map*)obj;
3227 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3228 128*map->count3);
3229}
3230
3231static PyMethodDef encoding_map_methods[] = {
3232 {"size", encoding_map_size, METH_NOARGS,
3233 PyDoc_STR("Return the size (in bytes) of this object") },
3234 { 0 }
3235};
3236
3237static void
3238encoding_map_dealloc(PyObject* o)
3239{
3240 PyObject_FREE(o);
3241}
3242
3243static PyTypeObject EncodingMapType = {
3244 PyObject_HEAD_INIT(NULL)
3245 0, /*ob_size*/
3246 "EncodingMap", /*tp_name*/
3247 sizeof(struct encoding_map), /*tp_basicsize*/
3248 0, /*tp_itemsize*/
3249 /* methods */
3250 encoding_map_dealloc, /*tp_dealloc*/
3251 0, /*tp_print*/
3252 0, /*tp_getattr*/
3253 0, /*tp_setattr*/
3254 0, /*tp_compare*/
3255 0, /*tp_repr*/
3256 0, /*tp_as_number*/
3257 0, /*tp_as_sequence*/
3258 0, /*tp_as_mapping*/
3259 0, /*tp_hash*/
3260 0, /*tp_call*/
3261 0, /*tp_str*/
3262 0, /*tp_getattro*/
3263 0, /*tp_setattro*/
3264 0, /*tp_as_buffer*/
3265 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3266 0, /*tp_doc*/
3267 0, /*tp_traverse*/
3268 0, /*tp_clear*/
3269 0, /*tp_richcompare*/
3270 0, /*tp_weaklistoffset*/
3271 0, /*tp_iter*/
3272 0, /*tp_iternext*/
3273 encoding_map_methods, /*tp_methods*/
3274 0, /*tp_members*/
3275 0, /*tp_getset*/
3276 0, /*tp_base*/
3277 0, /*tp_dict*/
3278 0, /*tp_descr_get*/
3279 0, /*tp_descr_set*/
3280 0, /*tp_dictoffset*/
3281 0, /*tp_init*/
3282 0, /*tp_alloc*/
3283 0, /*tp_new*/
3284 0, /*tp_free*/
3285 0, /*tp_is_gc*/
3286};
3287
3288PyObject*
3289PyUnicode_BuildEncodingMap(PyObject* string)
3290{
3291 Py_UNICODE *decode;
3292 PyObject *result;
3293 struct encoding_map *mresult;
3294 int i;
3295 int need_dict = 0;
3296 unsigned char level1[32];
3297 unsigned char level2[512];
3298 unsigned char *mlevel1, *mlevel2, *mlevel3;
3299 int count2 = 0, count3 = 0;
3300
3301 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3302 PyErr_BadArgument();
3303 return NULL;
3304 }
3305 decode = PyUnicode_AS_UNICODE(string);
3306 memset(level1, 0xFF, sizeof level1);
3307 memset(level2, 0xFF, sizeof level2);
3308
3309 /* If there isn't a one-to-one mapping of NULL to \0,
3310 or if there are non-BMP characters, we need to use
3311 a mapping dictionary. */
3312 if (decode[0] != 0)
3313 need_dict = 1;
3314 for (i = 1; i < 256; i++) {
3315 int l1, l2;
3316 if (decode[i] == 0
3317 #ifdef Py_UNICODE_WIDE
3318 || decode[i] > 0xFFFF
3319 #endif
3320 ) {
3321 need_dict = 1;
3322 break;
3323 }
3324 if (decode[i] == 0xFFFE)
3325 /* unmapped character */
3326 continue;
3327 l1 = decode[i] >> 11;
3328 l2 = decode[i] >> 7;
3329 if (level1[l1] == 0xFF)
3330 level1[l1] = count2++;
3331 if (level2[l2] == 0xFF)
3332 level2[l2] = count3++;
3333 }
3334
3335 if (count2 >= 0xFF || count3 >= 0xFF)
3336 need_dict = 1;
3337
3338 if (need_dict) {
3339 PyObject *result = PyDict_New();
3340 PyObject *key, *value;
3341 if (!result)
3342 return NULL;
3343 for (i = 0; i < 256; i++) {
3344 key = value = NULL;
3345 key = PyInt_FromLong(decode[i]);
3346 value = PyInt_FromLong(i);
3347 if (!key || !value)
3348 goto failed1;
3349 if (PyDict_SetItem(result, key, value) == -1)
3350 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00003351 Py_DECREF(key);
3352 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003353 }
3354 return result;
3355 failed1:
3356 Py_XDECREF(key);
3357 Py_XDECREF(value);
3358 Py_DECREF(result);
3359 return NULL;
3360 }
3361
3362 /* Create a three-level trie */
3363 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3364 16*count2 + 128*count3 - 1);
3365 if (!result)
3366 return PyErr_NoMemory();
3367 PyObject_Init(result, &EncodingMapType);
3368 mresult = (struct encoding_map*)result;
3369 mresult->count2 = count2;
3370 mresult->count3 = count3;
3371 mlevel1 = mresult->level1;
3372 mlevel2 = mresult->level23;
3373 mlevel3 = mresult->level23 + 16*count2;
3374 memcpy(mlevel1, level1, 32);
3375 memset(mlevel2, 0xFF, 16*count2);
3376 memset(mlevel3, 0, 128*count3);
3377 count3 = 0;
3378 for (i = 1; i < 256; i++) {
3379 int o1, o2, o3, i2, i3;
3380 if (decode[i] == 0xFFFE)
3381 /* unmapped character */
3382 continue;
3383 o1 = decode[i]>>11;
3384 o2 = (decode[i]>>7) & 0xF;
3385 i2 = 16*mlevel1[o1] + o2;
3386 if (mlevel2[i2] == 0xFF)
3387 mlevel2[i2] = count3++;
3388 o3 = decode[i] & 0x7F;
3389 i3 = 128*mlevel2[i2] + o3;
3390 mlevel3[i3] = i;
3391 }
3392 return result;
3393}
3394
3395static int
3396encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3397{
3398 struct encoding_map *map = (struct encoding_map*)mapping;
3399 int l1 = c>>11;
3400 int l2 = (c>>7) & 0xF;
3401 int l3 = c & 0x7F;
3402 int i;
3403
3404#ifdef Py_UNICODE_WIDE
3405 if (c > 0xFFFF) {
3406 return -1;
3407 }
3408#endif
3409 if (c == 0)
3410 return 0;
3411 /* level 1*/
3412 i = map->level1[l1];
3413 if (i == 0xFF) {
3414 return -1;
3415 }
3416 /* level 2*/
3417 i = map->level23[16*i+l2];
3418 if (i == 0xFF) {
3419 return -1;
3420 }
3421 /* level 3 */
3422 i = map->level23[16*map->count2 + 128*i + l3];
3423 if (i == 0) {
3424 return -1;
3425 }
3426 return i;
3427}
3428
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003429/* Lookup the character ch in the mapping. If the character
3430 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003431 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003432static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003433{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003434 PyObject *w = PyInt_FromLong((long)c);
3435 PyObject *x;
3436
3437 if (w == NULL)
3438 return NULL;
3439 x = PyObject_GetItem(mapping, w);
3440 Py_DECREF(w);
3441 if (x == NULL) {
3442 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3443 /* No mapping found means: mapping is undefined. */
3444 PyErr_Clear();
3445 x = Py_None;
3446 Py_INCREF(x);
3447 return x;
3448 } else
3449 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003450 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003451 else if (x == Py_None)
3452 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003453 else if (PyInt_Check(x)) {
3454 long value = PyInt_AS_LONG(x);
3455 if (value < 0 || value > 255) {
3456 PyErr_SetString(PyExc_TypeError,
3457 "character mapping must be in range(256)");
3458 Py_DECREF(x);
3459 return NULL;
3460 }
3461 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003462 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003463 else if (PyString_Check(x))
3464 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003465 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003466 /* wrong return value */
3467 PyErr_SetString(PyExc_TypeError,
3468 "character mapping must return integer, None or str");
3469 Py_DECREF(x);
3470 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003471 }
3472}
3473
Martin v. Löwis3f767792006-06-04 19:36:28 +00003474static int
3475charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3476{
3477 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3478 /* exponentially overallocate to minimize reallocations */
3479 if (requiredsize < 2*outsize)
3480 requiredsize = 2*outsize;
3481 if (_PyString_Resize(outobj, requiredsize)) {
3482 return 0;
3483 }
3484 return 1;
3485}
3486
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was encoded and written */
    enc_FAILED,         /* the mapping has no entry for the character */
    enc_EXCEPTION       /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003490/* lookup the character, put the result in the output string and adjust
3491 various state variables. Reallocate the output string if not enough
3492 space is available. Return a new reference to the object that
3493 was put in the output buffer, or Py_None, if the mapping was undefined
3494 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003495 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003496static
Martin v. Löwis3f767792006-06-04 19:36:28 +00003497charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003498 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003499{
Martin v. Löwis3f767792006-06-04 19:36:28 +00003500 PyObject *rep;
3501 char *outstart;
3502 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003503
Martin v. Löwis3f767792006-06-04 19:36:28 +00003504 if (mapping->ob_type == &EncodingMapType) {
3505 int res = encoding_map_lookup(c, mapping);
3506 Py_ssize_t requiredsize = *outpos+1;
3507 if (res == -1)
3508 return enc_FAILED;
3509 if (outsize<requiredsize)
3510 if (!charmapencode_resize(outobj, outpos, requiredsize))
3511 return enc_EXCEPTION;
3512 outstart = PyString_AS_STRING(*outobj);
3513 outstart[(*outpos)++] = (char)res;
3514 return enc_SUCCESS;
3515 }
3516
3517 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003518 if (rep==NULL)
Martin v. Löwis3f767792006-06-04 19:36:28 +00003519 return enc_EXCEPTION;
3520 else if (rep==Py_None) {
3521 Py_DECREF(rep);
3522 return enc_FAILED;
3523 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003524 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003525 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003526 if (outsize<requiredsize)
3527 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003528 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003529 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003530 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003531 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003532 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3533 }
3534 else {
3535 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003536 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3537 Py_ssize_t requiredsize = *outpos+repsize;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003538 if (outsize<requiredsize)
3539 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003540 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003541 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003542 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003543 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003544 memcpy(outstart + *outpos, repchars, repsize);
3545 *outpos += repsize;
3546 }
3547 }
Georg Brandl9f167602006-06-04 21:46:16 +00003548 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003549 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003550}
3551
3552/* handle an error in PyUnicode_EncodeCharmap
3553 Return 0 on success, -1 on error */
3554static
3555int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003556 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003557 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003558 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003559 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003560{
3561 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003562 Py_ssize_t repsize;
3563 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003564 Py_UNICODE *uni2;
3565 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003566 Py_ssize_t collstartpos = *inpos;
3567 Py_ssize_t collendpos = *inpos+1;
3568 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003569 char *encoding = "charmap";
3570 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00003571 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003572
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003573 /* find all unencodable characters */
3574 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00003575 PyObject *rep;
3576 if (mapping->ob_type == &EncodingMapType) {
3577 int res = encoding_map_lookup(p[collendpos], mapping);
3578 if (res != -1)
3579 break;
3580 ++collendpos;
3581 continue;
3582 }
3583
3584 rep = charmapencode_lookup(p[collendpos], mapping);
3585 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003586 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003587 else if (rep!=Py_None) {
3588 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003589 break;
3590 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003591 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003592 ++collendpos;
3593 }
3594 /* cache callback name lookup
3595 * (if not done yet, i.e. it's the first error) */
3596 if (*known_errorHandler==-1) {
3597 if ((errors==NULL) || (!strcmp(errors, "strict")))
3598 *known_errorHandler = 1;
3599 else if (!strcmp(errors, "replace"))
3600 *known_errorHandler = 2;
3601 else if (!strcmp(errors, "ignore"))
3602 *known_errorHandler = 3;
3603 else if (!strcmp(errors, "xmlcharrefreplace"))
3604 *known_errorHandler = 4;
3605 else
3606 *known_errorHandler = 0;
3607 }
3608 switch (*known_errorHandler) {
3609 case 1: /* strict */
3610 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3611 return -1;
3612 case 2: /* replace */
3613 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3614 x = charmapencode_output('?', mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003615 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003616 return -1;
3617 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003618 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003619 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3620 return -1;
3621 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003622 }
3623 /* fall through */
3624 case 3: /* ignore */
3625 *inpos = collendpos;
3626 break;
3627 case 4: /* xmlcharrefreplace */
3628 /* generate replacement (temporarily (mis)uses p) */
3629 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3630 char buffer[2+29+1+1];
3631 char *cp;
3632 sprintf(buffer, "&#%d;", (int)p[collpos]);
3633 for (cp = buffer; *cp; ++cp) {
3634 x = charmapencode_output(*cp, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003635 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003636 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003637 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003638 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3639 return -1;
3640 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003641 }
3642 }
3643 *inpos = collendpos;
3644 break;
3645 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003646 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003647 encoding, reason, p, size, exceptionObject,
3648 collstartpos, collendpos, &newpos);
3649 if (repunicode == NULL)
3650 return -1;
3651 /* generate replacement */
3652 repsize = PyUnicode_GET_SIZE(repunicode);
3653 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3654 x = charmapencode_output(*uni2, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003655 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003656 return -1;
3657 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003658 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003659 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003660 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3661 return -1;
3662 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003663 }
3664 *inpos = newpos;
3665 Py_DECREF(repunicode);
3666 }
3667 return 0;
3668}
3669
Guido van Rossumd57fd912000-03-10 22:53:23 +00003670PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003671 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003672 PyObject *mapping,
3673 const char *errors)
3674{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003675 /* output object */
3676 PyObject *res = NULL;
3677 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003678 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003679 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003680 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003681 PyObject *errorHandler = NULL;
3682 PyObject *exc = NULL;
3683 /* the following variable is used for caching string comparisons
3684 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3685 * 3=ignore, 4=xmlcharrefreplace */
3686 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003687
3688 /* Default to Latin-1 */
3689 if (mapping == NULL)
3690 return PyUnicode_EncodeLatin1(p, size, errors);
3691
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003692 /* allocate enough for a simple encoding without
3693 replacements, if we need more, we'll resize */
3694 res = PyString_FromStringAndSize(NULL, size);
3695 if (res == NULL)
3696 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003697 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003698 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003699
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003700 while (inpos<size) {
3701 /* try to encode it */
Martin v. Löwis3f767792006-06-04 19:36:28 +00003702 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3703 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003704 goto onError;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003705 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003706 if (charmap_encoding_error(p, size, &inpos, mapping,
3707 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003708 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003709 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003710 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003711 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003712 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003713 else
3714 /* done with this character => adjust input position */
3715 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003716 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003717
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003718 /* Resize if we allocated to much */
3719 if (respos<PyString_GET_SIZE(res)) {
3720 if (_PyString_Resize(&res, respos))
3721 goto onError;
3722 }
3723 Py_XDECREF(exc);
3724 Py_XDECREF(errorHandler);
3725 return res;
3726
3727 onError:
3728 Py_XDECREF(res);
3729 Py_XDECREF(exc);
3730 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003731 return NULL;
3732}
3733
3734PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3735 PyObject *mapping)
3736{
3737 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3738 PyErr_BadArgument();
3739 return NULL;
3740 }
3741 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3742 PyUnicode_GET_SIZE(unicode),
3743 mapping,
3744 NULL);
3745}
3746
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003747/* create or adjust a UnicodeTranslateError */
3748static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003749 const Py_UNICODE *unicode, Py_ssize_t size,
3750 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003751 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003752{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003753 if (*exceptionObject == NULL) {
3754 *exceptionObject = PyUnicodeTranslateError_Create(
3755 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003756 }
3757 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003758 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3759 goto onError;
3760 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3761 goto onError;
3762 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3763 goto onError;
3764 return;
3765 onError:
3766 Py_DECREF(*exceptionObject);
3767 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003768 }
3769}
3770
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003771/* raises a UnicodeTranslateError */
3772static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003773 const Py_UNICODE *unicode, Py_ssize_t size,
3774 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003775 const char *reason)
3776{
3777 make_translate_exception(exceptionObject,
3778 unicode, size, startpos, endpos, reason);
3779 if (*exceptionObject != NULL)
3780 PyCodec_StrictErrors(*exceptionObject);
3781}
3782
3783/* error handling callback helper:
3784 build arguments, call the callback and check the arguments,
3785 put the result into newpos and return the replacement string, which
3786 has to be freed by the caller */
3787static PyObject *unicode_translate_call_errorhandler(const char *errors,
3788 PyObject **errorHandler,
3789 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003790 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3791 Py_ssize_t startpos, Py_ssize_t endpos,
3792 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003793{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003794 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003795
Martin v. Löwis412fb672006-04-13 06:34:32 +00003796 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003797 PyObject *restuple;
3798 PyObject *resunicode;
3799
3800 if (*errorHandler == NULL) {
3801 *errorHandler = PyCodec_LookupError(errors);
3802 if (*errorHandler == NULL)
3803 return NULL;
3804 }
3805
3806 make_translate_exception(exceptionObject,
3807 unicode, size, startpos, endpos, reason);
3808 if (*exceptionObject == NULL)
3809 return NULL;
3810
3811 restuple = PyObject_CallFunctionObjArgs(
3812 *errorHandler, *exceptionObject, NULL);
3813 if (restuple == NULL)
3814 return NULL;
3815 if (!PyTuple_Check(restuple)) {
3816 PyErr_Format(PyExc_TypeError, &argparse[4]);
3817 Py_DECREF(restuple);
3818 return NULL;
3819 }
3820 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003821 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003822 Py_DECREF(restuple);
3823 return NULL;
3824 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003825 if (i_newpos<0)
3826 *newpos = size+i_newpos;
3827 else
3828 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003829 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003830 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003831 Py_DECREF(restuple);
3832 return NULL;
3833 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003834 Py_INCREF(resunicode);
3835 Py_DECREF(restuple);
3836 return resunicode;
3837}
3838
3839/* Lookup the character ch in the mapping and put the result in result,
3840 which must be decrefed by the caller.
3841 Return 0 on success, -1 on error */
3842static
3843int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3844{
3845 PyObject *w = PyInt_FromLong((long)c);
3846 PyObject *x;
3847
3848 if (w == NULL)
3849 return -1;
3850 x = PyObject_GetItem(mapping, w);
3851 Py_DECREF(w);
3852 if (x == NULL) {
3853 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3854 /* No mapping found means: use 1:1 mapping. */
3855 PyErr_Clear();
3856 *result = NULL;
3857 return 0;
3858 } else
3859 return -1;
3860 }
3861 else if (x == Py_None) {
3862 *result = x;
3863 return 0;
3864 }
3865 else if (PyInt_Check(x)) {
3866 long value = PyInt_AS_LONG(x);
3867 long max = PyUnicode_GetMax();
3868 if (value < 0 || value > max) {
3869 PyErr_Format(PyExc_TypeError,
3870 "character mapping must be in range(0x%lx)", max+1);
3871 Py_DECREF(x);
3872 return -1;
3873 }
3874 *result = x;
3875 return 0;
3876 }
3877 else if (PyUnicode_Check(x)) {
3878 *result = x;
3879 return 0;
3880 }
3881 else {
3882 /* wrong return value */
3883 PyErr_SetString(PyExc_TypeError,
3884 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003885 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003886 return -1;
3887 }
3888}
3889/* ensure that *outobj is at least requiredsize characters long,
3890if not reallocate and adjust various state variables.
3891Return 0 on success, -1 on error */
3892static
Walter Dörwald4894c302003-10-24 14:25:28 +00003893int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003894 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003895{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003896 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003897 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003898 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003899 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003900 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003901 if (requiredsize < 2 * oldsize)
3902 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003903 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003904 return -1;
3905 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003906 }
3907 return 0;
3908}
3909/* lookup the character, put the result in the output string and adjust
3910 various state variables. Return a new reference to the object that
3911 was put in the output buffer in *result, or Py_None, if the mapping was
3912 undefined (in which case no character was written).
3913 The called must decref result.
3914 Return 0 on success, -1 on error. */
3915static
Walter Dörwald4894c302003-10-24 14:25:28 +00003916int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003917 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003918 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003919{
Walter Dörwald4894c302003-10-24 14:25:28 +00003920 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003921 return -1;
3922 if (*res==NULL) {
3923 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003924 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003925 }
3926 else if (*res==Py_None)
3927 ;
3928 else if (PyInt_Check(*res)) {
3929 /* no overflow check, because we know that the space is enough */
3930 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3931 }
3932 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003933 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003934 if (repsize==1) {
3935 /* no overflow check, because we know that the space is enough */
3936 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3937 }
3938 else if (repsize!=0) {
3939 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003940 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003941 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003942 repsize - 1;
3943 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003944 return -1;
3945 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3946 *outp += repsize;
3947 }
3948 }
3949 else
3950 return -1;
3951 return 0;
3952}
3953
/* Translate the size characters starting at p through mapping and return
   the result as a new unicode object, or NULL with an exception set.

   Each character is looked up via charmaptranslate_output().  Characters
   for which the mapping yields Py_None are "untranslatable"; consecutive
   runs of them are handled according to errors:
     NULL / "strict"      raise a translate exception,
     "replace"            write one '?' per character,
     "ignore"             drop the characters,
     "xmlcharrefreplace"  write "&#N;" for each character,
     anything else        call unicode_translate_call_errorhandler() and
                          insert the replacement it returns. */
PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
                                     Py_ssize_t size,
                                     PyObject *mapping,
                                     const char *errors)
{
    /* output object */
    PyObject *res = NULL;
    /* pointers to the beginning and end+1 of input */
    const Py_UNICODE *startp = p;
    const Py_UNICODE *endp = p + size;
    /* pointer into the output */
    Py_UNICODE *str;
    /* current output position */
    Py_ssize_t respos = 0;
    char *reason = "character maps to <undefined>";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
     * 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    /* allocate enough for a simple 1:1 translation without
       replacements, if we need more, we'll resize */
    res = PyUnicode_FromUnicode(NULL, size);
    if (res == NULL)
        goto onError;
    /* empty input: the freshly allocated object is the result */
    if (size == 0)
        return res;
    str = PyUnicode_AS_UNICODE(res);

    while (p<endp) {
        /* try to encode it */
        PyObject *x = NULL;
        if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
            Py_XDECREF(x);
            goto onError;
        }
        /* x is only compared by pointer value below, never dereferenced,
           so releasing it first is fine */
        Py_XDECREF(x);
        if (x!=Py_None) /* it worked => adjust input pointer */
            ++p;
        else { /* untranslatable character */
            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
            Py_ssize_t repsize;
            Py_ssize_t newpos;
            Py_UNICODE *uni2;
            /* startpos for collecting untranslatable chars */
            const Py_UNICODE *collstart = p;
            const Py_UNICODE *collend = p+1;
            const Py_UNICODE *coll;

            /* find all untranslatable characters: extend collend while
               the mapping keeps returning Py_None */
            while (collend < endp) {
                if (charmaptranslate_lookup(*collend, mapping, &x))
                    goto onError;
                Py_XDECREF(x);
                if (x!=Py_None)
                    break;
                ++collend;
            }
            /* cache callback name lookup
             * (if not done yet, i.e. it's the first error) */
            if (known_errorHandler==-1) {
                if ((errors==NULL) || (!strcmp(errors, "strict")))
                    known_errorHandler = 1;
                else if (!strcmp(errors, "replace"))
                    known_errorHandler = 2;
                else if (!strcmp(errors, "ignore"))
                    known_errorHandler = 3;
                else if (!strcmp(errors, "xmlcharrefreplace"))
                    known_errorHandler = 4;
                else
                    known_errorHandler = 0;
            }
            switch (known_errorHandler) {
                case 1: /* strict */
                    raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
                    goto onError;
                case 2: /* replace */
                    /* No need to check for space, this is a 1:1 replacement */
                    for (coll = collstart; coll<collend; ++coll)
                        *str++ = '?';
                    /* fall through */
                case 3: /* ignore */
                    /* skip the whole collected run */
                    p = collend;
                    break;
                case 4: /* xmlcharrefreplace */
                    /* generate replacement (temporarily (mis)uses p) */
                    for (p = collstart; p < collend; ++p) {
                        char buffer[2+29+1+1];
                        char *cp;
                        sprintf(buffer, "&#%d;", (int)*p);
                        /* need room for: output so far + this reference +
                           the input after the collected run */
                        if (charmaptranslate_makespace(&res, &str,
                            (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
                            goto onError;
                        for (cp = buffer; *cp; ++cp)
                            *str++ = *cp;
                    }
                    p = collend;
                    break;
                default:
                    /* user supplied error handler: it returns the
                       replacement and the input position to resume at */
                    repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
                        reason, startp, size, &exc,
                        collstart-startp, collend-startp, &newpos);
                    if (repunicode == NULL)
                        goto onError;
                    /* generate replacement */
                    repsize = PyUnicode_GET_SIZE(repunicode);
                    if (charmaptranslate_makespace(&res, &str,
                        (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
                        Py_DECREF(repunicode);
                        goto onError;
                    }
                    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
                        *str++ = *uni2;
                    p = startp + newpos;
                    Py_DECREF(repunicode);
            }
        }
    }
    /* Resize if we allocated too much */
    respos = str-PyUnicode_AS_UNICODE(res);
    if (respos<PyUnicode_GET_SIZE(res)) {
        if (_PyUnicode_Resize(&res, respos) < 0)
            goto onError;
    }
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return res;

    onError:
    /* release everything acquired so far; all three may still be NULL */
    Py_XDECREF(res);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
4095
4096PyObject *PyUnicode_Translate(PyObject *str,
4097 PyObject *mapping,
4098 const char *errors)
4099{
4100 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004101
Guido van Rossumd57fd912000-03-10 22:53:23 +00004102 str = PyUnicode_FromObject(str);
4103 if (str == NULL)
4104 goto onError;
4105 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4106 PyUnicode_GET_SIZE(str),
4107 mapping,
4108 errors);
4109 Py_DECREF(str);
4110 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004111
Guido van Rossumd57fd912000-03-10 22:53:23 +00004112 onError:
4113 Py_XDECREF(str);
4114 return NULL;
4115}
Tim Petersced69f82003-09-16 20:30:58 +00004116
Guido van Rossum9e896b32000-04-05 20:11:21 +00004117/* --- Decimal Encoder ---------------------------------------------------- */
4118
4119int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004120 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004121 char *output,
4122 const char *errors)
4123{
4124 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004125 PyObject *errorHandler = NULL;
4126 PyObject *exc = NULL;
4127 const char *encoding = "decimal";
4128 const char *reason = "invalid decimal Unicode string";
4129 /* the following variable is used for caching string comparisons
4130 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4131 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004132
4133 if (output == NULL) {
4134 PyErr_BadArgument();
4135 return -1;
4136 }
4137
4138 p = s;
4139 end = s + length;
4140 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004141 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004142 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004143 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004144 Py_ssize_t repsize;
4145 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004146 Py_UNICODE *uni2;
4147 Py_UNICODE *collstart;
4148 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004149
Guido van Rossum9e896b32000-04-05 20:11:21 +00004150 if (Py_UNICODE_ISSPACE(ch)) {
4151 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004152 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004153 continue;
4154 }
4155 decimal = Py_UNICODE_TODECIMAL(ch);
4156 if (decimal >= 0) {
4157 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004158 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004159 continue;
4160 }
Guido van Rossumba477042000-04-06 18:18:10 +00004161 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004162 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004163 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004164 continue;
4165 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004166 /* All other characters are considered unencodable */
4167 collstart = p;
4168 collend = p+1;
4169 while (collend < end) {
4170 if ((0 < *collend && *collend < 256) ||
4171 !Py_UNICODE_ISSPACE(*collend) ||
4172 Py_UNICODE_TODECIMAL(*collend))
4173 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004174 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004175 /* cache callback name lookup
4176 * (if not done yet, i.e. it's the first error) */
4177 if (known_errorHandler==-1) {
4178 if ((errors==NULL) || (!strcmp(errors, "strict")))
4179 known_errorHandler = 1;
4180 else if (!strcmp(errors, "replace"))
4181 known_errorHandler = 2;
4182 else if (!strcmp(errors, "ignore"))
4183 known_errorHandler = 3;
4184 else if (!strcmp(errors, "xmlcharrefreplace"))
4185 known_errorHandler = 4;
4186 else
4187 known_errorHandler = 0;
4188 }
4189 switch (known_errorHandler) {
4190 case 1: /* strict */
4191 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4192 goto onError;
4193 case 2: /* replace */
4194 for (p = collstart; p < collend; ++p)
4195 *output++ = '?';
4196 /* fall through */
4197 case 3: /* ignore */
4198 p = collend;
4199 break;
4200 case 4: /* xmlcharrefreplace */
4201 /* generate replacement (temporarily (mis)uses p) */
4202 for (p = collstart; p < collend; ++p)
4203 output += sprintf(output, "&#%d;", (int)*p);
4204 p = collend;
4205 break;
4206 default:
4207 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4208 encoding, reason, s, length, &exc,
4209 collstart-s, collend-s, &newpos);
4210 if (repunicode == NULL)
4211 goto onError;
4212 /* generate replacement */
4213 repsize = PyUnicode_GET_SIZE(repunicode);
4214 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4215 Py_UNICODE ch = *uni2;
4216 if (Py_UNICODE_ISSPACE(ch))
4217 *output++ = ' ';
4218 else {
4219 decimal = Py_UNICODE_TODECIMAL(ch);
4220 if (decimal >= 0)
4221 *output++ = '0' + decimal;
4222 else if (0 < ch && ch < 256)
4223 *output++ = (char)ch;
4224 else {
4225 Py_DECREF(repunicode);
4226 raise_encode_exception(&exc, encoding,
4227 s, length, collstart-s, collend-s, reason);
4228 goto onError;
4229 }
4230 }
4231 }
4232 p = s + newpos;
4233 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004234 }
4235 }
4236 /* 0-terminate the output string */
4237 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004238 Py_XDECREF(exc);
4239 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004240 return 0;
4241
4242 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004243 Py_XDECREF(exc);
4244 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004245 return -1;
4246}
4247
Guido van Rossumd57fd912000-03-10 22:53:23 +00004248/* --- Helpers ------------------------------------------------------------ */
4249
/* Parameter macros binding the generic STRINGLIB_* names to the Py_UNICODE
   character type and the PyUnicode accessors/constructor.
   NOTE(review): presumably consumed by the stringlib/ headers included
   further down in this file -- confirm against those headers. */
#define STRINGLIB_CHAR Py_UNICODE

#define STRINGLIB_LEN PyUnicode_GET_SIZE
#define STRINGLIB_NEW PyUnicode_FromUnicode
#define STRINGLIB_STR PyUnicode_AS_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00004255
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00004256Py_LOCAL_INLINE(int)
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004257STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
4258{
Fredrik Lundh9c0e9c02006-05-26 18:24:15 +00004259 if (str[0] != other[0])
4260 return 1;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004261 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
4262}
4263
Fredrik Lundhb9479482006-05-26 17:22:38 +00004264#define STRINGLIB_EMPTY unicode_empty
4265
Fredrik Lundha50d2012006-05-26 17:04:58 +00004266#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004267
4268#include "stringlib/count.h"
4269#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00004270#include "stringlib/partition.h"
4271
/* helper macro to fixup start/end slice values.

   Operates on variables named `start` and `end` in the invoking scope and
   on (obj)->length: a negative start is offset by the length and then
   clamped to 0; an end beyond the length is clamped to the length, a
   negative end is offset by the length and then clamped to 0.

   Wrapped in do { } while (0) so that the expansion is a single statement
   (safe after an unbraced if/else); invoke it with a trailing semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
4284
Martin v. Löwis18e16552006-02-15 17:27:45 +00004285Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004286 PyObject *substr,
4287 Py_ssize_t start,
4288 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004289{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004290 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004291 PyUnicodeObject* str_obj;
4292 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004293
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004294 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4295 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004296 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004297 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4298 if (!sub_obj) {
4299 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004300 return -1;
4301 }
Tim Petersced69f82003-09-16 20:30:58 +00004302
Fredrik Lundhc8162812006-05-26 19:33:03 +00004303 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004304
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004305 result = stringlib_count(
4306 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4307 );
4308
4309 Py_DECREF(sub_obj);
4310 Py_DECREF(str_obj);
4311
Guido van Rossumd57fd912000-03-10 22:53:23 +00004312 return result;
4313}
4314
Martin v. Löwis18e16552006-02-15 17:27:45 +00004315Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004316 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004317 Py_ssize_t start,
4318 Py_ssize_t end,
4319 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004320{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004321 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004322
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004323 str = PyUnicode_FromObject(str);
4324 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004325 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004326 sub = PyUnicode_FromObject(sub);
4327 if (!sub) {
4328 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004329 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004330 }
Tim Petersced69f82003-09-16 20:30:58 +00004331
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004332 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004333 result = stringlib_find_slice(
4334 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4335 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4336 start, end
4337 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004338 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004339 result = stringlib_rfind_slice(
4340 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4341 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4342 start, end
4343 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004344
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004345 Py_DECREF(str);
4346 Py_DECREF(sub);
4347
Guido van Rossumd57fd912000-03-10 22:53:23 +00004348 return result;
4349}
4350
Tim Petersced69f82003-09-16 20:30:58 +00004351static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004352int tailmatch(PyUnicodeObject *self,
4353 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004354 Py_ssize_t start,
4355 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004356 int direction)
4357{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004358 if (substring->length == 0)
4359 return 1;
4360
Fredrik Lundhc8162812006-05-26 19:33:03 +00004361 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004362
4363 end -= substring->length;
4364 if (end < start)
4365 return 0;
4366
4367 if (direction > 0) {
4368 if (Py_UNICODE_MATCH(self, end, substring))
4369 return 1;
4370 } else {
4371 if (Py_UNICODE_MATCH(self, start, substring))
4372 return 1;
4373 }
4374
4375 return 0;
4376}
4377
Martin v. Löwis18e16552006-02-15 17:27:45 +00004378Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004379 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004380 Py_ssize_t start,
4381 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004382 int direction)
4383{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004384 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004385
Guido van Rossumd57fd912000-03-10 22:53:23 +00004386 str = PyUnicode_FromObject(str);
4387 if (str == NULL)
4388 return -1;
4389 substr = PyUnicode_FromObject(substr);
4390 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004391 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004392 return -1;
4393 }
Tim Petersced69f82003-09-16 20:30:58 +00004394
Guido van Rossumd57fd912000-03-10 22:53:23 +00004395 result = tailmatch((PyUnicodeObject *)str,
4396 (PyUnicodeObject *)substr,
4397 start, end, direction);
4398 Py_DECREF(str);
4399 Py_DECREF(substr);
4400 return result;
4401}
4402
Guido van Rossumd57fd912000-03-10 22:53:23 +00004403/* Apply fixfct filter to the Unicode object self and return a
4404 reference to the modified object */
4405
Tim Petersced69f82003-09-16 20:30:58 +00004406static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004407PyObject *fixup(PyUnicodeObject *self,
4408 int (*fixfct)(PyUnicodeObject *s))
4409{
4410
4411 PyUnicodeObject *u;
4412
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004413 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004414 if (u == NULL)
4415 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004416
4417 Py_UNICODE_COPY(u->str, self->str, self->length);
4418
Tim Peters7a29bd52001-09-12 03:03:31 +00004419 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004420 /* fixfct should return TRUE if it modified the buffer. If
4421 FALSE, return a reference to the original buffer instead
4422 (to save space, not time) */
4423 Py_INCREF(self);
4424 Py_DECREF(u);
4425 return (PyObject*) self;
4426 }
4427 return (PyObject*) u;
4428}
4429
Tim Petersced69f82003-09-16 20:30:58 +00004430static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004431int fixupper(PyUnicodeObject *self)
4432{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004433 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434 Py_UNICODE *s = self->str;
4435 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004436
Guido van Rossumd57fd912000-03-10 22:53:23 +00004437 while (len-- > 0) {
4438 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004439
Guido van Rossumd57fd912000-03-10 22:53:23 +00004440 ch = Py_UNICODE_TOUPPER(*s);
4441 if (ch != *s) {
4442 status = 1;
4443 *s = ch;
4444 }
4445 s++;
4446 }
4447
4448 return status;
4449}
4450
Tim Petersced69f82003-09-16 20:30:58 +00004451static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004452int fixlower(PyUnicodeObject *self)
4453{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004454 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004455 Py_UNICODE *s = self->str;
4456 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004457
Guido van Rossumd57fd912000-03-10 22:53:23 +00004458 while (len-- > 0) {
4459 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004460
Guido van Rossumd57fd912000-03-10 22:53:23 +00004461 ch = Py_UNICODE_TOLOWER(*s);
4462 if (ch != *s) {
4463 status = 1;
4464 *s = ch;
4465 }
4466 s++;
4467 }
4468
4469 return status;
4470}
4471
Tim Petersced69f82003-09-16 20:30:58 +00004472static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004473int fixswapcase(PyUnicodeObject *self)
4474{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004475 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476 Py_UNICODE *s = self->str;
4477 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004478
Guido van Rossumd57fd912000-03-10 22:53:23 +00004479 while (len-- > 0) {
4480 if (Py_UNICODE_ISUPPER(*s)) {
4481 *s = Py_UNICODE_TOLOWER(*s);
4482 status = 1;
4483 } else if (Py_UNICODE_ISLOWER(*s)) {
4484 *s = Py_UNICODE_TOUPPER(*s);
4485 status = 1;
4486 }
4487 s++;
4488 }
4489
4490 return status;
4491}
4492
Tim Petersced69f82003-09-16 20:30:58 +00004493static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004494int fixcapitalize(PyUnicodeObject *self)
4495{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004496 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004497 Py_UNICODE *s = self->str;
4498 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004499
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004500 if (len == 0)
4501 return 0;
4502 if (Py_UNICODE_ISLOWER(*s)) {
4503 *s = Py_UNICODE_TOUPPER(*s);
4504 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004505 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004506 s++;
4507 while (--len > 0) {
4508 if (Py_UNICODE_ISUPPER(*s)) {
4509 *s = Py_UNICODE_TOLOWER(*s);
4510 status = 1;
4511 }
4512 s++;
4513 }
4514 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004515}
4516
4517static
4518int fixtitle(PyUnicodeObject *self)
4519{
4520 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4521 register Py_UNICODE *e;
4522 int previous_is_cased;
4523
4524 /* Shortcut for single character strings */
4525 if (PyUnicode_GET_SIZE(self) == 1) {
4526 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4527 if (*p != ch) {
4528 *p = ch;
4529 return 1;
4530 }
4531 else
4532 return 0;
4533 }
Tim Petersced69f82003-09-16 20:30:58 +00004534
Guido van Rossumd57fd912000-03-10 22:53:23 +00004535 e = p + PyUnicode_GET_SIZE(self);
4536 previous_is_cased = 0;
4537 for (; p < e; p++) {
4538 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004539
Guido van Rossumd57fd912000-03-10 22:53:23 +00004540 if (previous_is_cased)
4541 *p = Py_UNICODE_TOLOWER(ch);
4542 else
4543 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004544
4545 if (Py_UNICODE_ISLOWER(ch) ||
4546 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004547 Py_UNICODE_ISTITLE(ch))
4548 previous_is_cased = 1;
4549 else
4550 previous_is_cased = 0;
4551 }
4552 return 1;
4553}
4554
Tim Peters8ce9f162004-08-27 01:49:32 +00004555PyObject *
4556PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004557{
Tim Peters8ce9f162004-08-27 01:49:32 +00004558 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004559 const Py_UNICODE blank = ' ';
4560 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00004561 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004562 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00004563 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4564 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004565 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4566 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004567 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004568 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004569 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004570
Tim Peters05eba1f2004-08-27 21:32:02 +00004571 fseq = PySequence_Fast(seq, "");
4572 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004573 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004574 }
4575
Tim Peters91879ab2004-08-27 22:35:44 +00004576 /* Grrrr. A codec may be invoked to convert str objects to
4577 * Unicode, and so it's possible to call back into Python code
4578 * during PyUnicode_FromObject(), and so it's possible for a sick
4579 * codec to change the size of fseq (if seq is a list). Therefore
4580 * we have to keep refetching the size -- can't assume seqlen
4581 * is invariant.
4582 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004583 seqlen = PySequence_Fast_GET_SIZE(fseq);
4584 /* If empty sequence, return u"". */
4585 if (seqlen == 0) {
4586 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4587 goto Done;
4588 }
4589 /* If singleton sequence with an exact Unicode, return that. */
4590 if (seqlen == 1) {
4591 item = PySequence_Fast_GET_ITEM(fseq, 0);
4592 if (PyUnicode_CheckExact(item)) {
4593 Py_INCREF(item);
4594 res = (PyUnicodeObject *)item;
4595 goto Done;
4596 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004597 }
4598
Tim Peters05eba1f2004-08-27 21:32:02 +00004599 /* At least two items to join, or one that isn't exact Unicode. */
4600 if (seqlen > 1) {
4601 /* Set up sep and seplen -- they're needed. */
4602 if (separator == NULL) {
4603 sep = &blank;
4604 seplen = 1;
4605 }
4606 else {
4607 internal_separator = PyUnicode_FromObject(separator);
4608 if (internal_separator == NULL)
4609 goto onError;
4610 sep = PyUnicode_AS_UNICODE(internal_separator);
4611 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004612 /* In case PyUnicode_FromObject() mutated seq. */
4613 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004614 }
4615 }
4616
4617 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004618 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004619 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004620 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004621 res_p = PyUnicode_AS_UNICODE(res);
4622 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004623
Tim Peters05eba1f2004-08-27 21:32:02 +00004624 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00004625 Py_ssize_t itemlen;
4626 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004627
4628 item = PySequence_Fast_GET_ITEM(fseq, i);
4629 /* Convert item to Unicode. */
4630 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4631 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00004632 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004633 " %.80s found",
4634 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004635 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004636 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004637 item = PyUnicode_FromObject(item);
4638 if (item == NULL)
4639 goto onError;
4640 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004641
Tim Peters91879ab2004-08-27 22:35:44 +00004642 /* In case PyUnicode_FromObject() mutated seq. */
4643 seqlen = PySequence_Fast_GET_SIZE(fseq);
4644
Tim Peters8ce9f162004-08-27 01:49:32 +00004645 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004646 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004647 new_res_used = res_used + itemlen;
Georg Brandl90e27d32006-06-10 06:40:50 +00004648 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004649 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004650 if (i < seqlen - 1) {
4651 new_res_used += seplen;
Georg Brandl90e27d32006-06-10 06:40:50 +00004652 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004653 goto Overflow;
4654 }
4655 if (new_res_used > res_alloc) {
4656 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004657 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004658 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00004659 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004660 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004661 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00004662 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004663 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004664 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004665 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004666 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004667 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004668
4669 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004670 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004671 res_p += itemlen;
4672 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004673 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004674 res_p += seplen;
4675 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004676 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004677 res_used = new_res_used;
4678 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004679
Tim Peters05eba1f2004-08-27 21:32:02 +00004680 /* Shrink res to match the used area; this probably can't fail,
4681 * but it's cheap to check.
4682 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004683 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004684 goto onError;
4685
4686 Done:
4687 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004688 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004689 return (PyObject *)res;
4690
Tim Peters8ce9f162004-08-27 01:49:32 +00004691 Overflow:
4692 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00004693 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00004694 Py_DECREF(item);
4695 /* fall through */
4696
Guido van Rossumd57fd912000-03-10 22:53:23 +00004697 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004698 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004699 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004700 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004701 return NULL;
4702}
4703
Tim Petersced69f82003-09-16 20:30:58 +00004704static
4705PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004706 Py_ssize_t left,
4707 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004708 Py_UNICODE fill)
4709{
4710 PyUnicodeObject *u;
4711
4712 if (left < 0)
4713 left = 0;
4714 if (right < 0)
4715 right = 0;
4716
Tim Peters7a29bd52001-09-12 03:03:31 +00004717 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718 Py_INCREF(self);
4719 return self;
4720 }
4721
4722 u = _PyUnicode_New(left + self->length + right);
4723 if (u) {
4724 if (left)
4725 Py_UNICODE_FILL(u->str, fill, left);
4726 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4727 if (right)
4728 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4729 }
4730
4731 return u;
4732}
4733
/* Append the slice data[left:right] to `list` as a new Unicode object.

   Expects a `PyObject *str` scratch variable, a `list` object and an
   `onError` label in the expanding function; control jumps to onError
   if creating the object or appending it fails.  The temporary
   reference held in `str` is always released.

   The body is wrapped in do { } while (0) so that the expansion is a
   single statement and stays correct under an unbraced if/else. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4744
4745static
4746PyObject *split_whitespace(PyUnicodeObject *self,
4747 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004748 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004749{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004750 register Py_ssize_t i;
4751 register Py_ssize_t j;
4752 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753 PyObject *str;
4754
4755 for (i = j = 0; i < len; ) {
4756 /* find a token */
4757 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4758 i++;
4759 j = i;
4760 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4761 i++;
4762 if (j < i) {
4763 if (maxcount-- <= 0)
4764 break;
4765 SPLIT_APPEND(self->str, j, i);
4766 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4767 i++;
4768 j = i;
4769 }
4770 }
4771 if (j < len) {
4772 SPLIT_APPEND(self->str, j, len);
4773 }
4774 return list;
4775
4776 onError:
4777 Py_DECREF(list);
4778 return NULL;
4779}
4780
4781PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004782 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004783{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004784 register Py_ssize_t i;
4785 register Py_ssize_t j;
4786 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004787 PyObject *list;
4788 PyObject *str;
4789 Py_UNICODE *data;
4790
4791 string = PyUnicode_FromObject(string);
4792 if (string == NULL)
4793 return NULL;
4794 data = PyUnicode_AS_UNICODE(string);
4795 len = PyUnicode_GET_SIZE(string);
4796
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797 list = PyList_New(0);
4798 if (!list)
4799 goto onError;
4800
4801 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004802 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004803
Guido van Rossumd57fd912000-03-10 22:53:23 +00004804 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004805 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004806 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807
4808 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004809 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810 if (i < len) {
4811 if (data[i] == '\r' && i + 1 < len &&
4812 data[i+1] == '\n')
4813 i += 2;
4814 else
4815 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004816 if (keepends)
4817 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004818 }
Guido van Rossum86662912000-04-11 15:38:46 +00004819 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820 j = i;
4821 }
4822 if (j < len) {
4823 SPLIT_APPEND(data, j, len);
4824 }
4825
4826 Py_DECREF(string);
4827 return list;
4828
4829 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004830 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004831 Py_DECREF(string);
4832 return NULL;
4833}
4834
Tim Petersced69f82003-09-16 20:30:58 +00004835static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004836PyObject *split_char(PyUnicodeObject *self,
4837 PyObject *list,
4838 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004839 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004841 register Py_ssize_t i;
4842 register Py_ssize_t j;
4843 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844 PyObject *str;
4845
4846 for (i = j = 0; i < len; ) {
4847 if (self->str[i] == ch) {
4848 if (maxcount-- <= 0)
4849 break;
4850 SPLIT_APPEND(self->str, j, i);
4851 i = j = i + 1;
4852 } else
4853 i++;
4854 }
4855 if (j <= len) {
4856 SPLIT_APPEND(self->str, j, len);
4857 }
4858 return list;
4859
4860 onError:
4861 Py_DECREF(list);
4862 return NULL;
4863}
4864
Tim Petersced69f82003-09-16 20:30:58 +00004865static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866PyObject *split_substring(PyUnicodeObject *self,
4867 PyObject *list,
4868 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004869 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004871 register Py_ssize_t i;
4872 register Py_ssize_t j;
4873 Py_ssize_t len = self->length;
4874 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004875 PyObject *str;
4876
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004877 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004878 if (Py_UNICODE_MATCH(self, i, substring)) {
4879 if (maxcount-- <= 0)
4880 break;
4881 SPLIT_APPEND(self->str, j, i);
4882 i = j = i + sublen;
4883 } else
4884 i++;
4885 }
4886 if (j <= len) {
4887 SPLIT_APPEND(self->str, j, len);
4888 }
4889 return list;
4890
4891 onError:
4892 Py_DECREF(list);
4893 return NULL;
4894}
4895
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004896static
4897PyObject *rsplit_whitespace(PyUnicodeObject *self,
4898 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004899 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004900{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004901 register Py_ssize_t i;
4902 register Py_ssize_t j;
4903 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004904 PyObject *str;
4905
4906 for (i = j = len - 1; i >= 0; ) {
4907 /* find a token */
4908 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4909 i--;
4910 j = i;
4911 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4912 i--;
4913 if (j > i) {
4914 if (maxcount-- <= 0)
4915 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004916 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004917 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4918 i--;
4919 j = i;
4920 }
4921 }
4922 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004923 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004924 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004925 if (PyList_Reverse(list) < 0)
4926 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004927 return list;
4928
4929 onError:
4930 Py_DECREF(list);
4931 return NULL;
4932}
4933
4934static
4935PyObject *rsplit_char(PyUnicodeObject *self,
4936 PyObject *list,
4937 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004938 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004939{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004940 register Py_ssize_t i;
4941 register Py_ssize_t j;
4942 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004943 PyObject *str;
4944
4945 for (i = j = len - 1; i >= 0; ) {
4946 if (self->str[i] == ch) {
4947 if (maxcount-- <= 0)
4948 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004949 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004950 j = i = i - 1;
4951 } else
4952 i--;
4953 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004954 if (j >= -1) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004955 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004956 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004957 if (PyList_Reverse(list) < 0)
4958 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004959 return list;
4960
4961 onError:
4962 Py_DECREF(list);
4963 return NULL;
4964}
4965
4966static
4967PyObject *rsplit_substring(PyUnicodeObject *self,
4968 PyObject *list,
4969 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004970 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004971{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004972 register Py_ssize_t i;
4973 register Py_ssize_t j;
4974 Py_ssize_t len = self->length;
4975 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004976 PyObject *str;
4977
4978 for (i = len - sublen, j = len; i >= 0; ) {
4979 if (Py_UNICODE_MATCH(self, i, substring)) {
4980 if (maxcount-- <= 0)
4981 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004982 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004983 j = i;
4984 i -= sublen;
4985 } else
4986 i--;
4987 }
4988 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004989 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004990 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004991 if (PyList_Reverse(list) < 0)
4992 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004993 return list;
4994
4995 onError:
4996 Py_DECREF(list);
4997 return NULL;
4998}
4999
Guido van Rossumd57fd912000-03-10 22:53:23 +00005000#undef SPLIT_APPEND
5001
5002static
5003PyObject *split(PyUnicodeObject *self,
5004 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005005 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005006{
5007 PyObject *list;
5008
5009 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005010 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005011
5012 list = PyList_New(0);
5013 if (!list)
5014 return NULL;
5015
5016 if (substring == NULL)
5017 return split_whitespace(self,list,maxcount);
5018
5019 else if (substring->length == 1)
5020 return split_char(self,list,substring->str[0],maxcount);
5021
5022 else if (substring->length == 0) {
5023 Py_DECREF(list);
5024 PyErr_SetString(PyExc_ValueError, "empty separator");
5025 return NULL;
5026 }
5027 else
5028 return split_substring(self,list,substring,maxcount);
5029}
5030
Tim Petersced69f82003-09-16 20:30:58 +00005031static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005032PyObject *rsplit(PyUnicodeObject *self,
5033 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005034 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005035{
5036 PyObject *list;
5037
5038 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005039 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005040
5041 list = PyList_New(0);
5042 if (!list)
5043 return NULL;
5044
5045 if (substring == NULL)
5046 return rsplit_whitespace(self,list,maxcount);
5047
5048 else if (substring->length == 1)
5049 return rsplit_char(self,list,substring->str[0],maxcount);
5050
5051 else if (substring->length == 0) {
5052 Py_DECREF(list);
5053 PyErr_SetString(PyExc_ValueError, "empty separator");
5054 return NULL;
5055 }
5056 else
5057 return rsplit_substring(self,list,substring,maxcount);
5058}
5059
/* Return a Unicode object in which occurrences of `str1` in `self`
   are replaced by `str2`.

   At most `maxcount` replacements are made; a negative `maxcount`
   means "no limit".  When nothing is replaced, `self` itself is
   returned (with a new reference) if it is an exact unicode instance,
   otherwise a copy of its contents.  Returns NULL on failure,
   including OverflowError when the result size cannot be represented.

   Three strategies are used:
     - equal lengths, one character: patch a copy of `self` in place;
     - equal lengths, longer:        overwrite matches in a copy;
     - different lengths:            count the matches, allocate the
                                     exact result size, then assemble
                                     the result piece by piece. */
static
PyObject *replace(PyUnicodeObject *self,
                  PyUnicodeObject *str1,
                  PyUnicodeObject *str2,
                  Py_ssize_t maxcount)
{
    PyUnicodeObject *u;

    if (maxcount < 0)
        maxcount = PY_SSIZE_T_MAX;

    if (str1->length == str2->length) {
        /* same length */
        Py_ssize_t i;
        if (str1->length == 1) {
            /* replace characters */
            Py_UNICODE u1, u2;
            /* cheap existence check before paying for a copy */
            if (!findchar(self->str, self->length, str1->str[0]))
                goto nothing;
            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
            if (!u)
                return NULL;
            Py_UNICODE_COPY(u->str, self->str, self->length);
            u1 = str1->str[0];
            u2 = str2->str[0];
            for (i = 0; i < u->length; i++)
                if (u->str[i] == u1) {
                    /* stop once the replacement budget is used up */
                    if (--maxcount < 0)
                        break;
                    u->str[i] = u2;
                }
        } else {
            /* i receives the search result; negative means no match */
            i = fastsearch(
                self->str, self->length, str1->str, str1->length, FAST_SEARCH
                );
            if (i < 0)
                goto nothing;
            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
            if (!u)
                return NULL;
            Py_UNICODE_COPY(u->str, self->str, self->length);
            /* matches are located in `self` and overwritten at the same
               offset in the copy `u` (both strings have equal length) */
            while (i <= self->length - str1->length)
                if (Py_UNICODE_MATCH(self, i, str1)) {
                    if (--maxcount < 0)
                        break;
                    Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
                    i += str1->length;
                } else
                    i++;
        }
    } else {

        /* n: replacements still to perform; i: read position in self;
           j: start of the next match; e: last index at which a match
           can start; p: write position in the result */
        Py_ssize_t n, i, j, e;
        Py_ssize_t product, new_size, delta;
        Py_UNICODE *p;

        /* replace strings */
        n = stringlib_count(self->str, self->length, str1->str, str1->length);
        if (n > maxcount)
            n = maxcount;
        if (n == 0)
            goto nothing;
        /* new_size = self->length + n * (str2->length - str1->length); */
        delta = (str2->length - str1->length);
        /* NOTE(review): delta cannot be 0 here, since this branch is only
           taken when the two lengths differ; the check is kept as a guard
           for the division below */
        if (delta == 0) {
            new_size = self->length;
        } else {
            product = n * (str2->length - str1->length);
            /* dividing back detects a multiplication that wrapped */
            if ((product / (str2->length - str1->length)) != n) {
                PyErr_SetString(PyExc_OverflowError,
                                "replace string is too long");
                return NULL;
            }
            new_size = self->length + product;
            if (new_size < 0) {
                PyErr_SetString(PyExc_OverflowError,
                                "replace string is too long");
                return NULL;
            }
        }
        u = _PyUnicode_New(new_size);
        if (!u)
            return NULL;
        i = 0;
        p = u->str;
        e = self->length - str1->length;
        if (str1->length > 0) {
            while (n-- > 0) {
                /* look for next match */
                j = i;
                while (j <= e) {
                    if (Py_UNICODE_MATCH(self, j, str1))
                        break;
                    j++;
                }
                if (j > i) {
                    /* ran off the end without finding a match */
                    if (j > e)
                        break;
                    /* copy unchanged part [i:j] */
                    Py_UNICODE_COPY(p, self->str+i, j-i);
                    p += j - i;
                }
                /* copy substitution string */
                if (str2->length > 0) {
                    Py_UNICODE_COPY(p, str2->str, str2->length);
                    p += str2->length;
                }
                i = j + str1->length;
            }
            if (i < self->length)
                /* copy tail [i:] */
                Py_UNICODE_COPY(p, self->str+i, self->length-i);
        } else {
            /* interleave */
            /* str1 is empty: write str2, then one character of self,
               alternating until n copies of str2 have been written */
            while (n > 0) {
                Py_UNICODE_COPY(p, str2->str, str2->length);
                p += str2->length;
                if (--n <= 0)
                    break;
                *p++ = self->str[i++];
            }
            /* copy whatever part of self has not been consumed yet */
            Py_UNICODE_COPY(p, self->str+i, self->length-i);
        }
    }
    return (PyObject *) u;

nothing:
    /* nothing to replace; return original string (when possible) */
    if (PyUnicode_CheckExact(self)) {
        Py_INCREF(self);
        return (PyObject *) self;
    }
    return PyUnicode_FromUnicode(self->str, self->length);
}
5194
5195/* --- Unicode Object Methods --------------------------------------------- */
5196
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005197PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198"S.title() -> unicode\n\
5199\n\
5200Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005201characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202
5203static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005204unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206 return fixup(self, fixtitle);
5207}
5208
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005209PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210"S.capitalize() -> unicode\n\
5211\n\
5212Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005213have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214
5215static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005216unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218 return fixup(self, fixcapitalize);
5219}
5220
5221#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005222PyDoc_STRVAR(capwords__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223"S.capwords() -> unicode\n\
5224\n\
5225Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005226normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227
5228static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005229unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230{
5231 PyObject *list;
5232 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005233 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234
Guido van Rossumd57fd912000-03-10 22:53:23 +00005235 /* Split into words */
5236 list = split(self, NULL, -1);
5237 if (!list)
5238 return NULL;
5239
5240 /* Capitalize each word */
5241 for (i = 0; i < PyList_GET_SIZE(list); i++) {
5242 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
5243 fixcapitalize);
5244 if (item == NULL)
5245 goto onError;
5246 Py_DECREF(PyList_GET_ITEM(list, i));
5247 PyList_SET_ITEM(list, i, item);
5248 }
5249
5250 /* Join the words to form a new string */
5251 item = PyUnicode_Join(NULL, list);
5252
5253onError:
5254 Py_DECREF(list);
5255 return (PyObject *)item;
5256}
5257#endif
5258
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005259/* Argument converter. Coerces to a single unicode character */
5260
5261static int
5262convert_uc(PyObject *obj, void *addr)
5263{
5264 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5265 PyObject *uniobj;
5266 Py_UNICODE *unistr;
5267
5268 uniobj = PyUnicode_FromObject(obj);
5269 if (uniobj == NULL) {
5270 PyErr_SetString(PyExc_TypeError,
5271 "The fill character cannot be converted to Unicode");
5272 return 0;
5273 }
5274 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5275 PyErr_SetString(PyExc_TypeError,
5276 "The fill character must be exactly one character long");
5277 Py_DECREF(uniobj);
5278 return 0;
5279 }
5280 unistr = PyUnicode_AS_UNICODE(uniobj);
5281 *fillcharloc = unistr[0];
5282 Py_DECREF(uniobj);
5283 return 1;
5284}
5285
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005286PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005287"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005289Return S centered in a Unicode string of length width. Padding is\n\
5290done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005291
5292static PyObject *
5293unicode_center(PyUnicodeObject *self, PyObject *args)
5294{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005295 Py_ssize_t marg, left;
5296 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005297 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005298
Thomas Woutersde017742006-02-16 19:34:37 +00005299 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005300 return NULL;
5301
Tim Peters7a29bd52001-09-12 03:03:31 +00005302 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303 Py_INCREF(self);
5304 return (PyObject*) self;
5305 }
5306
5307 marg = width - self->length;
5308 left = marg / 2 + (marg & width & 1);
5309
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005310 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311}
5312
Marc-André Lemburge5034372000-08-08 08:04:29 +00005313#if 0
5314
5315/* This code should go into some future Unicode collation support
5316 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005317 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005318
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005319/* speedy UTF-16 code point order comparison */
5320/* gleaned from: */
5321/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5322
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005323static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005324{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005325 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005326 0, 0, 0, 0, 0, 0, 0, 0,
5327 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005328 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005329};
5330
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331static int
5332unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5333{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005334 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005335
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336 Py_UNICODE *s1 = str1->str;
5337 Py_UNICODE *s2 = str2->str;
5338
5339 len1 = str1->length;
5340 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005341
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005343 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005344
5345 c1 = *s1++;
5346 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005347
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005348 if (c1 > (1<<11) * 26)
5349 c1 += utf16Fixup[c1>>11];
5350 if (c2 > (1<<11) * 26)
5351 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005352 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005353
5354 if (c1 != c2)
5355 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005356
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005357 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358 }
5359
5360 return (len1 < len2) ? -1 : (len1 != len2);
5361}
5362
Marc-André Lemburge5034372000-08-08 08:04:29 +00005363#else
5364
5365static int
5366unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5367{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005368 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005369
5370 Py_UNICODE *s1 = str1->str;
5371 Py_UNICODE *s2 = str2->str;
5372
5373 len1 = str1->length;
5374 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005375
Marc-André Lemburge5034372000-08-08 08:04:29 +00005376 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005377 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005378
Fredrik Lundh45714e92001-06-26 16:39:36 +00005379 c1 = *s1++;
5380 c2 = *s2++;
5381
5382 if (c1 != c2)
5383 return (c1 < c2) ? -1 : 1;
5384
Marc-André Lemburge5034372000-08-08 08:04:29 +00005385 len1--; len2--;
5386 }
5387
5388 return (len1 < len2) ? -1 : (len1 != len2);
5389}
5390
5391#endif
5392
Guido van Rossumd57fd912000-03-10 22:53:23 +00005393int PyUnicode_Compare(PyObject *left,
5394 PyObject *right)
5395{
5396 PyUnicodeObject *u = NULL, *v = NULL;
5397 int result;
5398
5399 /* Coerce the two arguments */
5400 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5401 if (u == NULL)
5402 goto onError;
5403 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5404 if (v == NULL)
5405 goto onError;
5406
Thomas Wouters7e474022000-07-16 12:04:32 +00005407 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408 if (v == u) {
5409 Py_DECREF(u);
5410 Py_DECREF(v);
5411 return 0;
5412 }
5413
5414 result = unicode_compare(u, v);
5415
5416 Py_DECREF(u);
5417 Py_DECREF(v);
5418 return result;
5419
5420onError:
5421 Py_XDECREF(u);
5422 Py_XDECREF(v);
5423 return -1;
5424}
5425
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00005426PyObject *PyUnicode_RichCompare(PyObject *left,
5427 PyObject *right,
5428 int op)
5429{
5430 int result;
5431
5432 result = PyUnicode_Compare(left, right);
5433 if (result == -1 && PyErr_Occurred())
5434 goto onError;
5435
5436 /* Convert the return value to a Boolean */
5437 switch (op) {
5438 case Py_EQ:
5439 result = (result == 0);
5440 break;
5441 case Py_NE:
5442 result = (result != 0);
5443 break;
5444 case Py_LE:
5445 result = (result <= 0);
5446 break;
5447 case Py_GE:
5448 result = (result >= 0);
5449 break;
5450 case Py_LT:
5451 result = (result == -1);
5452 break;
5453 case Py_GT:
5454 result = (result == 1);
5455 break;
5456 }
5457 return PyBool_FromLong(result);
5458
5459 onError:
5460
5461 /* Standard case
5462
5463 Type errors mean that PyUnicode_FromObject() could not convert
5464 one of the arguments (usually the right hand side) to Unicode,
5465 ie. we can't handle the comparison request. However, it is
5466 possible that the other object knows a comparison method, which
5467 is why we return Py_NotImplemented to give the other object a
5468 chance.
5469
5470 */
5471 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5472 PyErr_Clear();
5473 Py_INCREF(Py_NotImplemented);
5474 return Py_NotImplemented;
5475 }
5476 if (op != Py_EQ && op != Py_NE)
5477 return NULL;
5478
5479 /* Equality comparison.
5480
5481 This is a special case: we silence any PyExc_UnicodeDecodeError
5482 and instead turn it into a PyErr_UnicodeWarning.
5483
5484 */
5485 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5486 return NULL;
5487 PyErr_Clear();
5488 if (PyErr_Warn(PyExc_UnicodeWarning,
5489 (op == Py_EQ) ?
5490 "Unicode equal comparison "
5491 "failed to convert both arguments to Unicode - "
5492 "interpreting them as being unequal" :
5493 "Unicode unequal comparison "
5494 "failed to convert both arguments to Unicode - "
5495 "interpreting them as being unequal"
5496 ) < 0)
5497 return NULL;
5498 result = (op == Py_NE);
5499 return PyBool_FromLong(result);
5500}
5501
Guido van Rossum403d68b2000-03-13 15:55:09 +00005502int PyUnicode_Contains(PyObject *container,
5503 PyObject *element)
5504{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005505 PyObject *str, *sub;
5506 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005507
5508 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005509 sub = PyUnicode_FromObject(element);
5510 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005511 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005512 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00005513 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005514 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00005515
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005516 str = PyUnicode_FromObject(container);
5517 if (!str) {
5518 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00005519 return -1;
5520 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005521
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005522 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00005523
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005524 Py_DECREF(str);
5525 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00005526
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005527 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005528}
5529
Guido van Rossumd57fd912000-03-10 22:53:23 +00005530/* Concat to string or Unicode object giving a new Unicode object. */
5531
5532PyObject *PyUnicode_Concat(PyObject *left,
5533 PyObject *right)
5534{
5535 PyUnicodeObject *u = NULL, *v = NULL, *w;
5536
5537 /* Coerce the two arguments */
5538 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5539 if (u == NULL)
5540 goto onError;
5541 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5542 if (v == NULL)
5543 goto onError;
5544
5545 /* Shortcuts */
5546 if (v == unicode_empty) {
5547 Py_DECREF(v);
5548 return (PyObject *)u;
5549 }
5550 if (u == unicode_empty) {
5551 Py_DECREF(u);
5552 return (PyObject *)v;
5553 }
5554
5555 /* Concat the two Unicode strings */
5556 w = _PyUnicode_New(u->length + v->length);
5557 if (w == NULL)
5558 goto onError;
5559 Py_UNICODE_COPY(w->str, u->str, u->length);
5560 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5561
5562 Py_DECREF(u);
5563 Py_DECREF(v);
5564 return (PyObject *)w;
5565
5566onError:
5567 Py_XDECREF(u);
5568 Py_XDECREF(v);
5569 return NULL;
5570}
5571
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005572PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573"S.count(sub[, start[, end]]) -> int\n\
5574\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00005575Return the number of non-overlapping occurrences of substring sub in\n\
5576Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005577interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578
5579static PyObject *
5580unicode_count(PyUnicodeObject *self, PyObject *args)
5581{
5582 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005583 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005584 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585 PyObject *result;
5586
Guido van Rossumb8872e62000-05-09 14:14:27 +00005587 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5588 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589 return NULL;
5590
5591 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005592 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593 if (substring == NULL)
5594 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005595
Fredrik Lundhc8162812006-05-26 19:33:03 +00005596 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005598 result = PyInt_FromSsize_t(
5599 stringlib_count(self->str + start, end - start,
5600 substring->str, substring->length)
5601 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005602
5603 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005604
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605 return result;
5606}
5607
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005608PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005609"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005611Encodes S using the codec registered for encoding. encoding defaults\n\
5612to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005613handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005614a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5615'xmlcharrefreplace' as well as any other name registered with\n\
5616codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005617
5618static PyObject *
5619unicode_encode(PyUnicodeObject *self, PyObject *args)
5620{
5621 char *encoding = NULL;
5622 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005623 PyObject *v;
5624
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5626 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005627 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005628 if (v == NULL)
5629 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005630 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5631 PyErr_Format(PyExc_TypeError,
5632 "encoder did not return a string/unicode object "
5633 "(type=%.400s)",
5634 v->ob_type->tp_name);
5635 Py_DECREF(v);
5636 return NULL;
5637 }
5638 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005639
5640 onError:
5641 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005642}
5643
5644PyDoc_STRVAR(decode__doc__,
5645"S.decode([encoding[,errors]]) -> string or unicode\n\
5646\n\
5647Decodes S using the codec registered for encoding. encoding defaults\n\
5648to the default encoding. errors may be given to set a different error\n\
5649handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5650a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5651as well as any other name registerd with codecs.register_error that is\n\
5652able to handle UnicodeDecodeErrors.");
5653
5654static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005655unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005656{
5657 char *encoding = NULL;
5658 char *errors = NULL;
5659 PyObject *v;
5660
5661 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5662 return NULL;
5663 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005664 if (v == NULL)
5665 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005666 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5667 PyErr_Format(PyExc_TypeError,
5668 "decoder did not return a string/unicode object "
5669 "(type=%.400s)",
5670 v->ob_type->tp_name);
5671 Py_DECREF(v);
5672 return NULL;
5673 }
5674 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005675
5676 onError:
5677 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678}
5679
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005680PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681"S.expandtabs([tabsize]) -> unicode\n\
5682\n\
5683Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005684If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685
5686static PyObject*
5687unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5688{
5689 Py_UNICODE *e;
5690 Py_UNICODE *p;
5691 Py_UNICODE *q;
Guido van Rossum44a93e52008-03-11 21:14:54 +00005692 Py_UNICODE *qe;
5693 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694 PyUnicodeObject *u;
5695 int tabsize = 8;
5696
5697 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5698 return NULL;
5699
Thomas Wouters7e474022000-07-16 12:04:32 +00005700 /* First pass: determine size of output string */
Guido van Rossum44a93e52008-03-11 21:14:54 +00005701 i = 0; /* chars up to and including most recent \n or \r */
5702 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
5703 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704 for (p = self->str; p < e; p++)
5705 if (*p == '\t') {
Neal Norwitz66e64e22007-06-09 04:06:30 +00005706 if (tabsize > 0) {
Guido van Rossum44a93e52008-03-11 21:14:54 +00005707 incr = tabsize - (j % tabsize); /* cannot overflow */
5708 if (j > PY_SSIZE_T_MAX - incr)
5709 goto overflow1;
5710 j += incr;
5711 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712 }
5713 else {
Guido van Rossum44a93e52008-03-11 21:14:54 +00005714 if (j > PY_SSIZE_T_MAX - 1)
5715 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716 j++;
5717 if (*p == '\n' || *p == '\r') {
Guido van Rossum44a93e52008-03-11 21:14:54 +00005718 if (i > PY_SSIZE_T_MAX - j)
5719 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720 i += j;
Guido van Rossum44a93e52008-03-11 21:14:54 +00005721 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005722 }
5723 }
5724
Guido van Rossum44a93e52008-03-11 21:14:54 +00005725 if (i > PY_SSIZE_T_MAX - j)
5726 goto overflow1;
Neal Norwitz66e64e22007-06-09 04:06:30 +00005727
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728 /* Second pass: create output string and fill it */
5729 u = _PyUnicode_New(i + j);
5730 if (!u)
5731 return NULL;
5732
Guido van Rossum44a93e52008-03-11 21:14:54 +00005733 j = 0; /* same as in first pass */
5734 q = u->str; /* next output char */
5735 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736
5737 for (p = self->str; p < e; p++)
5738 if (*p == '\t') {
5739 if (tabsize > 0) {
5740 i = tabsize - (j % tabsize);
5741 j += i;
Guido van Rossum44a93e52008-03-11 21:14:54 +00005742 while (i--) {
5743 if (q >= qe)
5744 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745 *q++ = ' ';
Guido van Rossum44a93e52008-03-11 21:14:54 +00005746 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747 }
5748 }
5749 else {
Guido van Rossum44a93e52008-03-11 21:14:54 +00005750 if (q >= qe)
5751 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752 *q++ = *p;
Guido van Rossum44a93e52008-03-11 21:14:54 +00005753 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754 if (*p == '\n' || *p == '\r')
5755 j = 0;
5756 }
5757
5758 return (PyObject*) u;
Guido van Rossum44a93e52008-03-11 21:14:54 +00005759
5760 overflow2:
5761 Py_DECREF(u);
5762 overflow1:
5763 PyErr_SetString(PyExc_OverflowError, "new string is too long");
5764 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765}
5766
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005767PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768"S.find(sub [,start [,end]]) -> int\n\
5769\n\
5770Return the lowest index in S where substring sub is found,\n\
Georg Brandlb4d100c2007-07-29 17:37:22 +00005771such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772arguments start and end are interpreted as in slice notation.\n\
5773\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005774Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775
5776static PyObject *
5777unicode_find(PyUnicodeObject *self, PyObject *args)
5778{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005779 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005780 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005781 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005782 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005783
Guido van Rossumb8872e62000-05-09 14:14:27 +00005784 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5785 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005787 substring = PyUnicode_FromObject(substring);
5788 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789 return NULL;
5790
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005791 result = stringlib_find_slice(
5792 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5793 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5794 start, end
5795 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796
5797 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005798
5799 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800}
5801
5802static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005803unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005804{
5805 if (index < 0 || index >= self->length) {
5806 PyErr_SetString(PyExc_IndexError, "string index out of range");
5807 return NULL;
5808 }
5809
5810 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5811}
5812
5813static long
5814unicode_hash(PyUnicodeObject *self)
5815{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005816 /* Since Unicode objects compare equal to their ASCII string
5817 counterparts, they should use the individual character values
5818 as basis for their hash value. This is needed to assure that
5819 strings and Unicode objects behave in the same way as
5820 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821
Martin v. Löwis18e16552006-02-15 17:27:45 +00005822 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005823 register Py_UNICODE *p;
5824 register long x;
5825
Guido van Rossumd57fd912000-03-10 22:53:23 +00005826 if (self->hash != -1)
5827 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005828 len = PyUnicode_GET_SIZE(self);
5829 p = PyUnicode_AS_UNICODE(self);
5830 x = *p << 7;
5831 while (--len >= 0)
5832 x = (1000003*x) ^ *p++;
5833 x ^= PyUnicode_GET_SIZE(self);
5834 if (x == -1)
5835 x = -2;
5836 self->hash = x;
5837 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005838}
5839
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005840PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841"S.index(sub [,start [,end]]) -> int\n\
5842\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005843Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844
5845static PyObject *
5846unicode_index(PyUnicodeObject *self, PyObject *args)
5847{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005848 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005849 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005850 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005851 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005852
Guido van Rossumb8872e62000-05-09 14:14:27 +00005853 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5854 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005856 substring = PyUnicode_FromObject(substring);
5857 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858 return NULL;
5859
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005860 result = stringlib_find_slice(
5861 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5862 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5863 start, end
5864 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865
5866 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005867
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868 if (result < 0) {
5869 PyErr_SetString(PyExc_ValueError, "substring not found");
5870 return NULL;
5871 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005872
Martin v. Löwis18e16552006-02-15 17:27:45 +00005873 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874}
5875
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005876PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005877"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005879Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005880at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881
5882static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005883unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884{
5885 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5886 register const Py_UNICODE *e;
5887 int cased;
5888
Guido van Rossumd57fd912000-03-10 22:53:23 +00005889 /* Shortcut for single character strings */
5890 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005891 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005893 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005894 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005895 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005896
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897 e = p + PyUnicode_GET_SIZE(self);
5898 cased = 0;
5899 for (; p < e; p++) {
5900 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005901
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005903 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904 else if (!cased && Py_UNICODE_ISLOWER(ch))
5905 cased = 1;
5906 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005907 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908}
5909
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005910PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005911"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005913Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005914at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915
5916static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005917unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918{
5919 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5920 register const Py_UNICODE *e;
5921 int cased;
5922
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923 /* Shortcut for single character strings */
5924 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005925 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005927 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005928 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005929 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005930
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931 e = p + PyUnicode_GET_SIZE(self);
5932 cased = 0;
5933 for (; p < e; p++) {
5934 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005935
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005937 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938 else if (!cased && Py_UNICODE_ISUPPER(ch))
5939 cased = 1;
5940 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005941 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942}
5943
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005944PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005945"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005947Return True if S is a titlecased string and there is at least one\n\
5948character in S, i.e. upper- and titlecase characters may only\n\
5949follow uncased characters and lowercase characters only cased ones.\n\
5950Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951
5952static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005953unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954{
5955 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5956 register const Py_UNICODE *e;
5957 int cased, previous_is_cased;
5958
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959 /* Shortcut for single character strings */
5960 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005961 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5962 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005964 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005965 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005966 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005967
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968 e = p + PyUnicode_GET_SIZE(self);
5969 cased = 0;
5970 previous_is_cased = 0;
5971 for (; p < e; p++) {
5972 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005973
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5975 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005976 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977 previous_is_cased = 1;
5978 cased = 1;
5979 }
5980 else if (Py_UNICODE_ISLOWER(ch)) {
5981 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005982 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983 previous_is_cased = 1;
5984 cased = 1;
5985 }
5986 else
5987 previous_is_cased = 0;
5988 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005989 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990}
5991
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005992PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005993"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005995Return True if all characters in S are whitespace\n\
5996and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997
5998static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005999unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000{
6001 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6002 register const Py_UNICODE *e;
6003
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004 /* Shortcut for single character strings */
6005 if (PyUnicode_GET_SIZE(self) == 1 &&
6006 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006007 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006009 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006010 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006011 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006012
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 e = p + PyUnicode_GET_SIZE(self);
6014 for (; p < e; p++) {
6015 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006016 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006018 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019}
6020
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006021PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006022"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006023\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006024Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006025and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006026
6027static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006028unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006029{
6030 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6031 register const Py_UNICODE *e;
6032
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006033 /* Shortcut for single character strings */
6034 if (PyUnicode_GET_SIZE(self) == 1 &&
6035 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006036 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006037
6038 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006039 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006040 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006041
6042 e = p + PyUnicode_GET_SIZE(self);
6043 for (; p < e; p++) {
6044 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006045 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006046 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006047 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006048}
6049
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006050PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006051"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006052\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006053Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006054and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006055
6056static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006057unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006058{
6059 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6060 register const Py_UNICODE *e;
6061
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006062 /* Shortcut for single character strings */
6063 if (PyUnicode_GET_SIZE(self) == 1 &&
6064 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006065 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006066
6067 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006068 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006069 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006070
6071 e = p + PyUnicode_GET_SIZE(self);
6072 for (; p < e; p++) {
6073 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006074 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006075 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006076 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006077}
6078
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006079PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006080"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006082Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006083False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084
6085static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006086unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087{
6088 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6089 register const Py_UNICODE *e;
6090
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091 /* Shortcut for single character strings */
6092 if (PyUnicode_GET_SIZE(self) == 1 &&
6093 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006094 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006096 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006097 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006098 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006099
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100 e = p + PyUnicode_GET_SIZE(self);
6101 for (; p < e; p++) {
6102 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006103 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006105 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106}
6107
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006108PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006109"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006111Return True if all characters in S are digits\n\
6112and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113
6114static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006115unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116{
6117 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6118 register const Py_UNICODE *e;
6119
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120 /* Shortcut for single character strings */
6121 if (PyUnicode_GET_SIZE(self) == 1 &&
6122 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006123 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006125 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006126 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006127 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006128
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129 e = p + PyUnicode_GET_SIZE(self);
6130 for (; p < e; p++) {
6131 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006132 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006134 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135}
6136
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006137PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006138"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006140Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006141False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142
6143static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006144unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145{
6146 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6147 register const Py_UNICODE *e;
6148
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 /* Shortcut for single character strings */
6150 if (PyUnicode_GET_SIZE(self) == 1 &&
6151 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006152 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006154 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006155 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006156 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006157
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158 e = p + PyUnicode_GET_SIZE(self);
6159 for (; p < e; p++) {
6160 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006161 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006163 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164}
6165
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006166PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167"S.join(sequence) -> unicode\n\
6168\n\
6169Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006170sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171
6172static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006173unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006175 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176}
6177
Martin v. Löwis18e16552006-02-15 17:27:45 +00006178static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179unicode_length(PyUnicodeObject *self)
6180{
6181 return self->length;
6182}
6183
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006184PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006185"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186\n\
6187Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006188done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006189
6190static PyObject *
6191unicode_ljust(PyUnicodeObject *self, PyObject *args)
6192{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006193 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006194 Py_UNICODE fillchar = ' ';
6195
Martin v. Löwis412fb672006-04-13 06:34:32 +00006196 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197 return NULL;
6198
Tim Peters7a29bd52001-09-12 03:03:31 +00006199 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200 Py_INCREF(self);
6201 return (PyObject*) self;
6202 }
6203
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006204 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205}
6206
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006207PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208"S.lower() -> unicode\n\
6209\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006210Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211
6212static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006213unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215 return fixup(self, fixlower);
6216}
6217
/* Strip modes; each value doubles as an index into stripformat[] */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the strip modes above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string past its "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
6226
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006227/* externally visible for str.strip(unicode) */
6228PyObject *
6229_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6230{
6231 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006232 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006233 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006234 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6235 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006236
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006237 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6238
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006239 i = 0;
6240 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006241 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6242 i++;
6243 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006244 }
6245
6246 j = len;
6247 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006248 do {
6249 j--;
6250 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6251 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006252 }
6253
6254 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006255 Py_INCREF(self);
6256 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006257 }
6258 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006259 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006260}
6261
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262
6263static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006264do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006266 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006267 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006268
6269 i = 0;
6270 if (striptype != RIGHTSTRIP) {
6271 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6272 i++;
6273 }
6274 }
6275
6276 j = len;
6277 if (striptype != LEFTSTRIP) {
6278 do {
6279 j--;
6280 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6281 j++;
6282 }
6283
6284 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6285 Py_INCREF(self);
6286 return (PyObject*)self;
6287 }
6288 else
6289 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290}
6291
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006292
6293static PyObject *
6294do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6295{
6296 PyObject *sep = NULL;
6297
6298 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6299 return NULL;
6300
6301 if (sep != NULL && sep != Py_None) {
6302 if (PyUnicode_Check(sep))
6303 return _PyUnicode_XStrip(self, striptype, sep);
6304 else if (PyString_Check(sep)) {
6305 PyObject *res;
6306 sep = PyUnicode_FromObject(sep);
6307 if (sep==NULL)
6308 return NULL;
6309 res = _PyUnicode_XStrip(self, striptype, sep);
6310 Py_DECREF(sep);
6311 return res;
6312 }
6313 else {
6314 PyErr_Format(PyExc_TypeError,
6315 "%s arg must be None, unicode or str",
6316 STRIPNAME(striptype));
6317 return NULL;
6318 }
6319 }
6320
6321 return do_strip(self, striptype);
6322}
6323
6324
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006325PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006326"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006327\n\
6328Return a copy of the string S with leading and trailing\n\
6329whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006330If chars is given and not None, remove characters in chars instead.\n\
6331If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006332
6333static PyObject *
6334unicode_strip(PyUnicodeObject *self, PyObject *args)
6335{
6336 if (PyTuple_GET_SIZE(args) == 0)
6337 return do_strip(self, BOTHSTRIP); /* Common case */
6338 else
6339 return do_argstrip(self, BOTHSTRIP, args);
6340}
6341
6342
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006343PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006344"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006345\n\
6346Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006347If chars is given and not None, remove characters in chars instead.\n\
6348If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006349
6350static PyObject *
6351unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6352{
6353 if (PyTuple_GET_SIZE(args) == 0)
6354 return do_strip(self, LEFTSTRIP); /* Common case */
6355 else
6356 return do_argstrip(self, LEFTSTRIP, args);
6357}
6358
6359
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006360PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006361"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006362\n\
6363Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006364If chars is given and not None, remove characters in chars instead.\n\
6365If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006366
6367static PyObject *
6368unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6369{
6370 if (PyTuple_GET_SIZE(args) == 0)
6371 return do_strip(self, RIGHTSTRIP); /* Common case */
6372 else
6373 return do_argstrip(self, RIGHTSTRIP, args);
6374}
6375
6376
Guido van Rossumd57fd912000-03-10 22:53:23 +00006377static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006378unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379{
6380 PyUnicodeObject *u;
6381 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006382 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006383 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384
6385 if (len < 0)
6386 len = 0;
6387
Tim Peters7a29bd52001-09-12 03:03:31 +00006388 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389 /* no repeat, return original string */
6390 Py_INCREF(str);
6391 return (PyObject*) str;
6392 }
Tim Peters8f422462000-09-09 06:13:41 +00006393
6394 /* ensure # of chars needed doesn't overflow int and # of bytes
6395 * needed doesn't overflow size_t
6396 */
6397 nchars = len * str->length;
6398 if (len && nchars / len != str->length) {
6399 PyErr_SetString(PyExc_OverflowError,
6400 "repeated string is too long");
6401 return NULL;
6402 }
6403 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6404 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6405 PyErr_SetString(PyExc_OverflowError,
6406 "repeated string is too long");
6407 return NULL;
6408 }
6409 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410 if (!u)
6411 return NULL;
6412
6413 p = u->str;
6414
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006415 if (str->length == 1 && len > 0) {
6416 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006417 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00006418 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006419 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006420 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006421 done = str->length;
6422 }
6423 while (done < nchars) {
6424 int n = (done <= nchars-done) ? done : nchars-done;
6425 Py_UNICODE_COPY(p+done, p, n);
6426 done += n;
6427 }
6428 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429
6430 return (PyObject*) u;
6431}
6432
6433PyObject *PyUnicode_Replace(PyObject *obj,
6434 PyObject *subobj,
6435 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006436 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437{
6438 PyObject *self;
6439 PyObject *str1;
6440 PyObject *str2;
6441 PyObject *result;
6442
6443 self = PyUnicode_FromObject(obj);
6444 if (self == NULL)
6445 return NULL;
6446 str1 = PyUnicode_FromObject(subobj);
6447 if (str1 == NULL) {
6448 Py_DECREF(self);
6449 return NULL;
6450 }
6451 str2 = PyUnicode_FromObject(replobj);
6452 if (str2 == NULL) {
6453 Py_DECREF(self);
6454 Py_DECREF(str1);
6455 return NULL;
6456 }
Tim Petersced69f82003-09-16 20:30:58 +00006457 result = replace((PyUnicodeObject *)self,
6458 (PyUnicodeObject *)str1,
6459 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 maxcount);
6461 Py_DECREF(self);
6462 Py_DECREF(str1);
6463 Py_DECREF(str2);
6464 return result;
6465}
6466
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006467PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468"S.replace (old, new[, maxsplit]) -> unicode\n\
6469\n\
6470Return a copy of S with all occurrences of substring\n\
6471old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006472given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473
6474static PyObject*
6475unicode_replace(PyUnicodeObject *self, PyObject *args)
6476{
6477 PyUnicodeObject *str1;
6478 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006479 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480 PyObject *result;
6481
Martin v. Löwis18e16552006-02-15 17:27:45 +00006482 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483 return NULL;
6484 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6485 if (str1 == NULL)
6486 return NULL;
6487 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006488 if (str2 == NULL) {
6489 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006491 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492
6493 result = replace(self, str1, str2, maxcount);
6494
6495 Py_DECREF(str1);
6496 Py_DECREF(str2);
6497 return result;
6498}
6499
6500static
6501PyObject *unicode_repr(PyObject *unicode)
6502{
6503 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6504 PyUnicode_GET_SIZE(unicode),
6505 1);
6506}
6507
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006508PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509"S.rfind(sub [,start [,end]]) -> int\n\
6510\n\
6511Return the highest index in S where substring sub is found,\n\
Georg Brandlb4d100c2007-07-29 17:37:22 +00006512such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513arguments start and end are interpreted as in slice notation.\n\
6514\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006515Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516
6517static PyObject *
6518unicode_rfind(PyUnicodeObject *self, PyObject *args)
6519{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006520 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006521 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006522 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006523 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524
Guido van Rossumb8872e62000-05-09 14:14:27 +00006525 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6526 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006528 substring = PyUnicode_FromObject(substring);
6529 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 return NULL;
6531
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006532 result = stringlib_rfind_slice(
6533 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6534 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6535 start, end
6536 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537
6538 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006539
6540 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541}
6542
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006543PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544"S.rindex(sub [,start [,end]]) -> int\n\
6545\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006546Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547
6548static PyObject *
6549unicode_rindex(PyUnicodeObject *self, PyObject *args)
6550{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006551 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006552 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006553 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006554 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555
Guido van Rossumb8872e62000-05-09 14:14:27 +00006556 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6557 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006559 substring = PyUnicode_FromObject(substring);
6560 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561 return NULL;
6562
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006563 result = stringlib_rfind_slice(
6564 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6565 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6566 start, end
6567 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568
6569 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006570
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571 if (result < 0) {
6572 PyErr_SetString(PyExc_ValueError, "substring not found");
6573 return NULL;
6574 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006575 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576}
6577
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006578PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006579"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580\n\
6581Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006582done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583
6584static PyObject *
6585unicode_rjust(PyUnicodeObject *self, PyObject *args)
6586{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006587 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006588 Py_UNICODE fillchar = ' ';
6589
Martin v. Löwis412fb672006-04-13 06:34:32 +00006590 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591 return NULL;
6592
Tim Peters7a29bd52001-09-12 03:03:31 +00006593 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594 Py_INCREF(self);
6595 return (PyObject*) self;
6596 }
6597
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006598 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599}
6600
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006602unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603{
6604 /* standard clamping */
6605 if (start < 0)
6606 start = 0;
6607 if (end < 0)
6608 end = 0;
6609 if (end > self->length)
6610 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006611 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612 /* full slice, return original string */
6613 Py_INCREF(self);
6614 return (PyObject*) self;
6615 }
6616 if (start > end)
6617 start = end;
6618 /* copy slice */
6619 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6620 end - start);
6621}
6622
6623PyObject *PyUnicode_Split(PyObject *s,
6624 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006625 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626{
6627 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006628
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629 s = PyUnicode_FromObject(s);
6630 if (s == NULL)
6631 return NULL;
6632 if (sep != NULL) {
6633 sep = PyUnicode_FromObject(sep);
6634 if (sep == NULL) {
6635 Py_DECREF(s);
6636 return NULL;
6637 }
6638 }
6639
6640 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6641
6642 Py_DECREF(s);
6643 Py_XDECREF(sep);
6644 return result;
6645}
6646
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006647PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648"S.split([sep [,maxsplit]]) -> list of strings\n\
6649\n\
6650Return a list of the words in S, using sep as the\n\
6651delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006652splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006653any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654
6655static PyObject*
6656unicode_split(PyUnicodeObject *self, PyObject *args)
6657{
6658 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006659 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660
Martin v. Löwis18e16552006-02-15 17:27:45 +00006661 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662 return NULL;
6663
6664 if (substring == Py_None)
6665 return split(self, NULL, maxcount);
6666 else if (PyUnicode_Check(substring))
6667 return split(self, (PyUnicodeObject *)substring, maxcount);
6668 else
6669 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6670}
6671
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006672PyObject *
6673PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6674{
6675 PyObject* str_obj;
6676 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006677 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00006678
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006679 str_obj = PyUnicode_FromObject(str_in);
6680 if (!str_obj)
6681 return NULL;
6682 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00006683 if (!sep_obj) {
6684 Py_DECREF(str_obj);
6685 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006686 }
6687
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006688 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00006689 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6690 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6691 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006692
Fredrik Lundhb9479482006-05-26 17:22:38 +00006693 Py_DECREF(sep_obj);
6694 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006695
6696 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006697}
6698
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006699
6700PyObject *
6701PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6702{
6703 PyObject* str_obj;
6704 PyObject* sep_obj;
6705 PyObject* out;
6706
6707 str_obj = PyUnicode_FromObject(str_in);
6708 if (!str_obj)
6709 return NULL;
6710 sep_obj = PyUnicode_FromObject(sep_in);
6711 if (!sep_obj) {
6712 Py_DECREF(str_obj);
6713 return NULL;
6714 }
6715
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006716 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006717 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6718 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6719 );
6720
6721 Py_DECREF(sep_obj);
6722 Py_DECREF(str_obj);
6723
6724 return out;
6725}
6726
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006727PyDoc_STRVAR(partition__doc__,
6728"S.partition(sep) -> (head, sep, tail)\n\
6729\n\
6730Searches for the separator sep in S, and returns the part before it,\n\
6731the separator itself, and the part after it. If the separator is not\n\
6732found, returns S and two empty strings.");
6733
6734static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00006735unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006736{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006737 return PyUnicode_Partition((PyObject *)self, separator);
6738}
6739
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006740PyDoc_STRVAR(rpartition__doc__,
Neal Norwitz29a5fdb2006-09-05 02:21:38 +00006741"S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006742\n\
6743Searches for the separator sep in S, starting at the end of S, and returns\n\
6744the part before it, the separator itself, and the part after it. If the\n\
Neal Norwitz29a5fdb2006-09-05 02:21:38 +00006745separator is not found, returns two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006746
6747static PyObject*
6748unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6749{
6750 return PyUnicode_RPartition((PyObject *)self, separator);
6751}
6752
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006753PyObject *PyUnicode_RSplit(PyObject *s,
6754 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006755 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006756{
6757 PyObject *result;
6758
6759 s = PyUnicode_FromObject(s);
6760 if (s == NULL)
6761 return NULL;
6762 if (sep != NULL) {
6763 sep = PyUnicode_FromObject(sep);
6764 if (sep == NULL) {
6765 Py_DECREF(s);
6766 return NULL;
6767 }
6768 }
6769
6770 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6771
6772 Py_DECREF(s);
6773 Py_XDECREF(sep);
6774 return result;
6775}
6776
6777PyDoc_STRVAR(rsplit__doc__,
6778"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6779\n\
6780Return a list of the words in S, using sep as the\n\
6781delimiter string, starting at the end of the string and\n\
6782working to the front. If maxsplit is given, at most maxsplit\n\
6783splits are done. If sep is not specified, any whitespace string\n\
6784is a separator.");
6785
6786static PyObject*
6787unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6788{
6789 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006790 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006791
Martin v. Löwis18e16552006-02-15 17:27:45 +00006792 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006793 return NULL;
6794
6795 if (substring == Py_None)
6796 return rsplit(self, NULL, maxcount);
6797 else if (PyUnicode_Check(substring))
6798 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6799 else
6800 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6801}
6802
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006803PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006804"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805\n\
6806Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006807Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006808is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809
6810static PyObject*
6811unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6812{
Guido van Rossum86662912000-04-11 15:38:46 +00006813 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814
Guido van Rossum86662912000-04-11 15:38:46 +00006815 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006816 return NULL;
6817
Guido van Rossum86662912000-04-11 15:38:46 +00006818 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819}
6820
6821static
6822PyObject *unicode_str(PyUnicodeObject *self)
6823{
Fred Drakee4315f52000-05-09 19:53:39 +00006824 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825}
6826
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006827PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828"S.swapcase() -> unicode\n\
6829\n\
6830Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006831and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006832
6833static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006834unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836 return fixup(self, fixswapcase);
6837}
6838
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006839PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840"S.translate(table) -> unicode\n\
6841\n\
6842Return a copy of the string S, where all characters have been mapped\n\
6843through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006844Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6845Unmapped characters are left untouched. Characters mapped to None\n\
6846are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847
6848static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006849unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850{
Tim Petersced69f82003-09-16 20:30:58 +00006851 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006853 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854 "ignore");
6855}
6856
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006857PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858"S.upper() -> unicode\n\
6859\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006860Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861
6862static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006863unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865 return fixup(self, fixupper);
6866}
6867
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006868PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869"S.zfill(width) -> unicode\n\
6870\n\
6871Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006872of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873
6874static PyObject *
6875unicode_zfill(PyUnicodeObject *self, PyObject *args)
6876{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006877 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878 PyUnicodeObject *u;
6879
Martin v. Löwis18e16552006-02-15 17:27:45 +00006880 Py_ssize_t width;
6881 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882 return NULL;
6883
6884 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006885 if (PyUnicode_CheckExact(self)) {
6886 Py_INCREF(self);
6887 return (PyObject*) self;
6888 }
6889 else
6890 return PyUnicode_FromUnicode(
6891 PyUnicode_AS_UNICODE(self),
6892 PyUnicode_GET_SIZE(self)
6893 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894 }
6895
6896 fill = width - self->length;
6897
6898 u = pad(self, fill, 0, '0');
6899
Walter Dörwald068325e2002-04-15 13:36:47 +00006900 if (u == NULL)
6901 return NULL;
6902
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903 if (u->str[fill] == '+' || u->str[fill] == '-') {
6904 /* move sign to beginning of string */
6905 u->str[0] = u->str[fill];
6906 u->str[fill] = '0';
6907 }
6908
6909 return (PyObject*) u;
6910}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911
#if 0
/* Debugging helper, compiled out: reports unicode_freelist_size as a
   Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
6919
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006920PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006921"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006923Return True if S starts with the specified prefix, False otherwise.\n\
6924With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00006925With optional end, stop comparing S at that position.\n\
6926prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927
6928static PyObject *
6929unicode_startswith(PyUnicodeObject *self,
6930 PyObject *args)
6931{
Georg Brandl24250812006-06-09 18:45:48 +00006932 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006934 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006935 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00006936 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937
Georg Brandl24250812006-06-09 18:45:48 +00006938 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00006939 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00006941 if (PyTuple_Check(subobj)) {
6942 Py_ssize_t i;
6943 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6944 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6945 PyTuple_GET_ITEM(subobj, i));
6946 if (substring == NULL)
6947 return NULL;
6948 result = tailmatch(self, substring, start, end, -1);
6949 Py_DECREF(substring);
6950 if (result) {
6951 Py_RETURN_TRUE;
6952 }
6953 }
6954 /* nothing matched */
6955 Py_RETURN_FALSE;
6956 }
6957 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00006959 return NULL;
6960 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00006962 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963}
6964
6965
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006966PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006967"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006969Return True if S ends with the specified suffix, False otherwise.\n\
6970With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00006971With optional end, stop comparing S at that position.\n\
6972suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973
6974static PyObject *
6975unicode_endswith(PyUnicodeObject *self,
6976 PyObject *args)
6977{
Georg Brandl24250812006-06-09 18:45:48 +00006978 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006980 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006981 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00006982 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983
Georg Brandl24250812006-06-09 18:45:48 +00006984 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
6985 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00006987 if (PyTuple_Check(subobj)) {
6988 Py_ssize_t i;
6989 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6990 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6991 PyTuple_GET_ITEM(subobj, i));
6992 if (substring == NULL)
6993 return NULL;
6994 result = tailmatch(self, substring, start, end, +1);
6995 Py_DECREF(substring);
6996 if (result) {
6997 Py_RETURN_TRUE;
6998 }
6999 }
7000 Py_RETURN_FALSE;
7001 }
7002 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007004 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005
Georg Brandl24250812006-06-09 18:45:48 +00007006 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007008 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009}
7010
7011
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007012
7013static PyObject *
7014unicode_getnewargs(PyUnicodeObject *v)
7015{
7016 return Py_BuildValue("(u#)", v->str, v->length);
7017}
7018
7019
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020static PyMethodDef unicode_methods[] = {
7021
7022 /* Order is according to common usage: often used methods should
7023 appear first, since lookup is done sequentially. */
7024
Georg Brandlecdc0a92006-03-30 12:19:07 +00007025 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007026 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7027 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007028 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007029 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7030 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7031 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7032 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7033 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7034 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7035 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007036 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007037 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7038 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7039 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007040 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007041 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007042/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7043 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7044 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7045 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007046 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007047 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007048 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007049 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007050 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7051 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7052 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7053 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7054 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7055 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7056 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7057 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7058 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7059 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7060 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7061 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7062 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7063 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007064 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007065#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007066 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067#endif
7068
7069#if 0
7070 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007071 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072#endif
7073
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007074 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007075 {NULL, NULL}
7076};
7077
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007078static PyObject *
7079unicode_mod(PyObject *v, PyObject *w)
7080{
7081 if (!PyUnicode_Check(v)) {
7082 Py_INCREF(Py_NotImplemented);
7083 return Py_NotImplemented;
7084 }
7085 return PyUnicode_Format(v, w);
7086}
7087
7088static PyNumberMethods unicode_as_number = {
7089 0, /*nb_add*/
7090 0, /*nb_subtract*/
7091 0, /*nb_multiply*/
7092 0, /*nb_divide*/
7093 unicode_mod, /*nb_remainder*/
7094};
7095
Guido van Rossumd57fd912000-03-10 22:53:23 +00007096static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007097 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00007098 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007099 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7100 (ssizeargfunc) unicode_getitem, /* sq_item */
7101 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102 0, /* sq_ass_item */
7103 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00007104 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007105};
7106
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007107static PyObject*
7108unicode_subscript(PyUnicodeObject* self, PyObject* item)
7109{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007110 if (PyIndex_Check(item)) {
7111 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007112 if (i == -1 && PyErr_Occurred())
7113 return NULL;
7114 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007115 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007116 return unicode_getitem(self, i);
7117 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007118 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007119 Py_UNICODE* source_buf;
7120 Py_UNICODE* result_buf;
7121 PyObject* result;
7122
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007123 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007124 &start, &stop, &step, &slicelength) < 0) {
7125 return NULL;
7126 }
7127
7128 if (slicelength <= 0) {
7129 return PyUnicode_FromUnicode(NULL, 0);
7130 } else {
7131 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Anthony Baxtera6286212006-04-11 07:42:36 +00007132 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7133 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007134
7135 if (result_buf == NULL)
7136 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007137
7138 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7139 result_buf[i] = source_buf[cur];
7140 }
Tim Petersced69f82003-09-16 20:30:58 +00007141
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007142 result = PyUnicode_FromUnicode(result_buf, slicelength);
7143 PyMem_FREE(result_buf);
7144 return result;
7145 }
7146 } else {
7147 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7148 return NULL;
7149 }
7150}
7151
7152static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007153 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007154 (binaryfunc)unicode_subscript, /* mp_subscript */
7155 (objobjargproc)0, /* mp_ass_subscript */
7156};
7157
Martin v. Löwis18e16552006-02-15 17:27:45 +00007158static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007160 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007161 const void **ptr)
7162{
7163 if (index != 0) {
7164 PyErr_SetString(PyExc_SystemError,
7165 "accessing non-existent unicode segment");
7166 return -1;
7167 }
7168 *ptr = (void *) self->str;
7169 return PyUnicode_GET_DATA_SIZE(self);
7170}
7171
Martin v. Löwis18e16552006-02-15 17:27:45 +00007172static Py_ssize_t
7173unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007174 const void **ptr)
7175{
7176 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007177 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178 return -1;
7179}
7180
7181static int
7182unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007183 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007184{
7185 if (lenp)
7186 *lenp = PyUnicode_GET_DATA_SIZE(self);
7187 return 1;
7188}
7189
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007190static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007192 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193 const void **ptr)
7194{
7195 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007196
Guido van Rossumd57fd912000-03-10 22:53:23 +00007197 if (index != 0) {
7198 PyErr_SetString(PyExc_SystemError,
7199 "accessing non-existent unicode segment");
7200 return -1;
7201 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007202 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007203 if (str == NULL)
7204 return -1;
7205 *ptr = (void *) PyString_AS_STRING(str);
7206 return PyString_GET_SIZE(str);
7207}
7208
7209/* Helpers for PyUnicode_Format() */
7210
7211static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007212getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007214 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215 if (argidx < arglen) {
7216 (*p_argidx)++;
7217 if (arglen < 0)
7218 return args;
7219 else
7220 return PyTuple_GetItem(args, argidx);
7221 }
7222 PyErr_SetString(PyExc_TypeError,
7223 "not enough arguments for format string");
7224 return NULL;
7225}
7226
/* Conversion-flag bits collected while parsing a format specifier; the
   flag loop in PyUnicode_Format() maps each flag character to one bit. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
7232
Martin v. Löwis18e16552006-02-15 17:27:45 +00007233static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007234strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007235{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007236 register Py_ssize_t i;
7237 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007238 for (i = len - 1; i >= 0; i--)
7239 buffer[i] = (Py_UNICODE) charbuffer[i];
7240
Guido van Rossumd57fd912000-03-10 22:53:23 +00007241 return len;
7242}
7243
Neal Norwitzfc76d632006-01-10 06:03:13 +00007244static int
7245doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7246{
Tim Peters15231542006-02-16 01:08:01 +00007247 Py_ssize_t result;
7248
Neal Norwitzfc76d632006-01-10 06:03:13 +00007249 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007250 result = strtounicode(buffer, (char *)buffer);
7251 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007252}
7253
7254static int
7255longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7256{
Tim Peters15231542006-02-16 01:08:01 +00007257 Py_ssize_t result;
7258
Neal Norwitzfc76d632006-01-10 06:03:13 +00007259 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007260 result = strtounicode(buffer, (char *)buffer);
7261 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007262}
7263
Guido van Rossum078151d2002-08-11 04:24:12 +00007264/* XXX To save some code duplication, formatfloat/long/int could have been
7265 shared with stringobject.c, converting from 8-bit to Unicode after the
7266 formatting is done. */
7267
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268static int
7269formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007270 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007271 int flags,
7272 int prec,
7273 int type,
7274 PyObject *v)
7275{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007276 /* fmt = '%#.' + `prec` + `type`
7277 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007278 char fmt[20];
7279 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007280
Guido van Rossumd57fd912000-03-10 22:53:23 +00007281 x = PyFloat_AsDouble(v);
7282 if (x == -1.0 && PyErr_Occurred())
7283 return -1;
7284 if (prec < 0)
7285 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007286 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7287 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007288 /* Worst case length calc to ensure no buffer overrun:
7289
7290 'g' formats:
7291 fmt = %#.<prec>g
7292 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7293 for any double rep.)
7294 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7295
7296 'f' formats:
7297 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7298 len = 1 + 50 + 1 + prec = 52 + prec
7299
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007300 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007301 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007302
7303 */
Georg Brandlc5db9232007-07-12 08:38:04 +00007304 if (((type == 'g' || type == 'G') &&
7305 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007306 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007307 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007308 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007309 return -1;
7310 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007311 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7312 (flags&F_ALT) ? "#" : "",
7313 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007314 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315}
7316
Tim Peters38fd5b62000-09-21 05:43:11 +00007317static PyObject*
7318formatlong(PyObject *val, int flags, int prec, int type)
7319{
7320 char *buf;
7321 int i, len;
7322 PyObject *str; /* temporary string object. */
7323 PyUnicodeObject *result;
7324
7325 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7326 if (!str)
7327 return NULL;
7328 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007329 if (!result) {
7330 Py_DECREF(str);
7331 return NULL;
7332 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007333 for (i = 0; i < len; i++)
7334 result->str[i] = buf[i];
7335 result->str[len] = 0;
7336 Py_DECREF(str);
7337 return (PyObject*)result;
7338}
7339
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340static int
7341formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007342 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343 int flags,
7344 int prec,
7345 int type,
7346 PyObject *v)
7347{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007348 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007349 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7350 * + 1 + 1
7351 * = 24
7352 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007353 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007354 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007355 long x;
7356
7357 x = PyInt_AsLong(v);
7358 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007359 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007360 if (x < 0 && type == 'u') {
7361 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007362 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007363 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7364 sign = "-";
7365 else
7366 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007367 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007368 prec = 1;
7369
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007370 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7371 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007372 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007373 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007374 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007375 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007376 return -1;
7377 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007378
7379 if ((flags & F_ALT) &&
7380 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007381 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007382 * of issues that cause pain:
7383 * - when 0 is being converted, the C standard leaves off
7384 * the '0x' or '0X', which is inconsistent with other
7385 * %#x/%#X conversions and inconsistent with Python's
7386 * hex() function
7387 * - there are platforms that violate the standard and
7388 * convert 0 with the '0x' or '0X'
7389 * (Metrowerks, Compaq Tru64)
7390 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007391 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007392 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007393 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007394 * We can achieve the desired consistency by inserting our
7395 * own '0x' or '0X' prefix, and substituting %x/%X in place
7396 * of %#x/%#X.
7397 *
7398 * Note that this is the same approach as used in
7399 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007400 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007401 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7402 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007403 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007404 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007405 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7406 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007407 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007408 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007409 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007410 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007411 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007412 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007413}
7414
7415static int
7416formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007417 size_t buflen,
7418 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007419{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007420 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007421 if (PyUnicode_Check(v)) {
7422 if (PyUnicode_GET_SIZE(v) != 1)
7423 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007424 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007425 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007427 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007428 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007429 goto onError;
7430 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7431 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007432
7433 else {
7434 /* Integer input truncated to a character */
7435 long x;
7436 x = PyInt_AsLong(v);
7437 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007438 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007439#ifdef Py_UNICODE_WIDE
7440 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007441 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007442 "%c arg not in range(0x110000) "
7443 "(wide Python build)");
7444 return -1;
7445 }
7446#else
7447 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007448 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007449 "%c arg not in range(0x10000) "
7450 "(narrow Python build)");
7451 return -1;
7452 }
7453#endif
7454 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007455 }
7456 buf[1] = '\0';
7457 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007458
7459 onError:
7460 PyErr_SetString(PyExc_TypeError,
7461 "%c requires int or char");
7462 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463}
7464
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that it behaves as a single
   operand wherever the macro is used.
*/
#define FORMATBUFLEN ((size_t)120)
7474
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475PyObject *PyUnicode_Format(PyObject *format,
7476 PyObject *args)
7477{
7478 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007479 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007480 int args_owned = 0;
7481 PyUnicodeObject *result = NULL;
7482 PyObject *dict = NULL;
7483 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007484
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485 if (format == NULL || args == NULL) {
7486 PyErr_BadInternalCall();
7487 return NULL;
7488 }
7489 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007490 if (uformat == NULL)
7491 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007492 fmt = PyUnicode_AS_UNICODE(uformat);
7493 fmtcnt = PyUnicode_GET_SIZE(uformat);
7494
7495 reslen = rescnt = fmtcnt + 100;
7496 result = _PyUnicode_New(reslen);
7497 if (result == NULL)
7498 goto onError;
7499 res = PyUnicode_AS_UNICODE(result);
7500
7501 if (PyTuple_Check(args)) {
7502 arglen = PyTuple_Size(args);
7503 argidx = 0;
7504 }
7505 else {
7506 arglen = -1;
7507 argidx = -2;
7508 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007509 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7510 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007511 dict = args;
7512
7513 while (--fmtcnt >= 0) {
7514 if (*fmt != '%') {
7515 if (--rescnt < 0) {
7516 rescnt = fmtcnt + 100;
7517 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007518 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007519 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007520 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7521 --rescnt;
7522 }
7523 *res++ = *fmt++;
7524 }
7525 else {
7526 /* Got a format specifier */
7527 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007528 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007530 Py_UNICODE c = '\0';
7531 Py_UNICODE fill;
7532 PyObject *v = NULL;
7533 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007534 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007535 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007536 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007537 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007538
7539 fmt++;
7540 if (*fmt == '(') {
7541 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007542 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007543 PyObject *key;
7544 int pcount = 1;
7545
7546 if (dict == NULL) {
7547 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007548 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549 goto onError;
7550 }
7551 ++fmt;
7552 --fmtcnt;
7553 keystart = fmt;
7554 /* Skip over balanced parentheses */
7555 while (pcount > 0 && --fmtcnt >= 0) {
7556 if (*fmt == ')')
7557 --pcount;
7558 else if (*fmt == '(')
7559 ++pcount;
7560 fmt++;
7561 }
7562 keylen = fmt - keystart - 1;
7563 if (fmtcnt < 0 || pcount > 0) {
7564 PyErr_SetString(PyExc_ValueError,
7565 "incomplete format key");
7566 goto onError;
7567 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007568#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007569 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007570 then looked up since Python uses strings to hold
7571 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007572 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573 key = PyUnicode_EncodeUTF8(keystart,
7574 keylen,
7575 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007576#else
7577 key = PyUnicode_FromUnicode(keystart, keylen);
7578#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579 if (key == NULL)
7580 goto onError;
7581 if (args_owned) {
7582 Py_DECREF(args);
7583 args_owned = 0;
7584 }
7585 args = PyObject_GetItem(dict, key);
7586 Py_DECREF(key);
7587 if (args == NULL) {
7588 goto onError;
7589 }
7590 args_owned = 1;
7591 arglen = -1;
7592 argidx = -2;
7593 }
7594 while (--fmtcnt >= 0) {
7595 switch (c = *fmt++) {
7596 case '-': flags |= F_LJUST; continue;
7597 case '+': flags |= F_SIGN; continue;
7598 case ' ': flags |= F_BLANK; continue;
7599 case '#': flags |= F_ALT; continue;
7600 case '0': flags |= F_ZERO; continue;
7601 }
7602 break;
7603 }
7604 if (c == '*') {
7605 v = getnextarg(args, arglen, &argidx);
7606 if (v == NULL)
7607 goto onError;
7608 if (!PyInt_Check(v)) {
7609 PyErr_SetString(PyExc_TypeError,
7610 "* wants int");
7611 goto onError;
7612 }
7613 width = PyInt_AsLong(v);
7614 if (width < 0) {
7615 flags |= F_LJUST;
7616 width = -width;
7617 }
7618 if (--fmtcnt >= 0)
7619 c = *fmt++;
7620 }
7621 else if (c >= '0' && c <= '9') {
7622 width = c - '0';
7623 while (--fmtcnt >= 0) {
7624 c = *fmt++;
7625 if (c < '0' || c > '9')
7626 break;
7627 if ((width*10) / 10 != width) {
7628 PyErr_SetString(PyExc_ValueError,
7629 "width too big");
7630 goto onError;
7631 }
7632 width = width*10 + (c - '0');
7633 }
7634 }
7635 if (c == '.') {
7636 prec = 0;
7637 if (--fmtcnt >= 0)
7638 c = *fmt++;
7639 if (c == '*') {
7640 v = getnextarg(args, arglen, &argidx);
7641 if (v == NULL)
7642 goto onError;
7643 if (!PyInt_Check(v)) {
7644 PyErr_SetString(PyExc_TypeError,
7645 "* wants int");
7646 goto onError;
7647 }
7648 prec = PyInt_AsLong(v);
7649 if (prec < 0)
7650 prec = 0;
7651 if (--fmtcnt >= 0)
7652 c = *fmt++;
7653 }
7654 else if (c >= '0' && c <= '9') {
7655 prec = c - '0';
7656 while (--fmtcnt >= 0) {
7657 c = Py_CHARMASK(*fmt++);
7658 if (c < '0' || c > '9')
7659 break;
7660 if ((prec*10) / 10 != prec) {
7661 PyErr_SetString(PyExc_ValueError,
7662 "prec too big");
7663 goto onError;
7664 }
7665 prec = prec*10 + (c - '0');
7666 }
7667 }
7668 } /* prec */
7669 if (fmtcnt >= 0) {
7670 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007671 if (--fmtcnt >= 0)
7672 c = *fmt++;
7673 }
7674 }
7675 if (fmtcnt < 0) {
7676 PyErr_SetString(PyExc_ValueError,
7677 "incomplete format");
7678 goto onError;
7679 }
7680 if (c != '%') {
7681 v = getnextarg(args, arglen, &argidx);
7682 if (v == NULL)
7683 goto onError;
7684 }
7685 sign = 0;
7686 fill = ' ';
7687 switch (c) {
7688
7689 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007690 pbuf = formatbuf;
7691 /* presume that buffer length is at least 1 */
7692 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693 len = 1;
7694 break;
7695
7696 case 's':
7697 case 'r':
7698 if (PyUnicode_Check(v) && c == 's') {
7699 temp = v;
7700 Py_INCREF(temp);
7701 }
7702 else {
7703 PyObject *unicode;
7704 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007705 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007706 else
7707 temp = PyObject_Repr(v);
7708 if (temp == NULL)
7709 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007710 if (PyUnicode_Check(temp))
7711 /* nothing to do */;
7712 else if (PyString_Check(temp)) {
7713 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007714 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007715 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007716 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007718 Py_DECREF(temp);
7719 temp = unicode;
7720 if (temp == NULL)
7721 goto onError;
7722 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007723 else {
7724 Py_DECREF(temp);
7725 PyErr_SetString(PyExc_TypeError,
7726 "%s argument has non-string str()");
7727 goto onError;
7728 }
7729 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007730 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731 len = PyUnicode_GET_SIZE(temp);
7732 if (prec >= 0 && len > prec)
7733 len = prec;
7734 break;
7735
7736 case 'i':
7737 case 'd':
7738 case 'u':
7739 case 'o':
7740 case 'x':
7741 case 'X':
7742 if (c == 'i')
7743 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007744 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007745 temp = formatlong(v, flags, prec, c);
7746 if (!temp)
7747 goto onError;
7748 pbuf = PyUnicode_AS_UNICODE(temp);
7749 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007750 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007752 else {
7753 pbuf = formatbuf;
7754 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7755 flags, prec, c, v);
7756 if (len < 0)
7757 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007758 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007759 }
7760 if (flags & F_ZERO)
7761 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007762 break;
7763
7764 case 'e':
7765 case 'E':
7766 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007767 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768 case 'g':
7769 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007770 if (c == 'F')
7771 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007772 pbuf = formatbuf;
7773 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7774 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007775 if (len < 0)
7776 goto onError;
7777 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007778 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007779 fill = '0';
7780 break;
7781
7782 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007783 pbuf = formatbuf;
7784 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007785 if (len < 0)
7786 goto onError;
7787 break;
7788
7789 default:
7790 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007791 "unsupported format character '%c' (0x%x) "
Armin Rigo4b63c212006-10-04 11:44:06 +00007792 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00007793 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007794 (int)c,
Armin Rigo4b63c212006-10-04 11:44:06 +00007795 (Py_ssize_t)(fmt - 1 -
7796 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007797 goto onError;
7798 }
7799 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007800 if (*pbuf == '-' || *pbuf == '+') {
7801 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802 len--;
7803 }
7804 else if (flags & F_SIGN)
7805 sign = '+';
7806 else if (flags & F_BLANK)
7807 sign = ' ';
7808 else
7809 sign = 0;
7810 }
7811 if (width < len)
7812 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007813 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007814 reslen -= rescnt;
7815 rescnt = width + fmtcnt + 100;
7816 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007817 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007818 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007819 PyErr_NoMemory();
7820 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007821 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007822 if (_PyUnicode_Resize(&result, reslen) < 0) {
7823 Py_XDECREF(temp);
7824 goto onError;
7825 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007826 res = PyUnicode_AS_UNICODE(result)
7827 + reslen - rescnt;
7828 }
7829 if (sign) {
7830 if (fill != ' ')
7831 *res++ = sign;
7832 rescnt--;
7833 if (width > len)
7834 width--;
7835 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007836 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7837 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007838 assert(pbuf[1] == c);
7839 if (fill != ' ') {
7840 *res++ = *pbuf++;
7841 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007842 }
Tim Petersfff53252001-04-12 18:38:48 +00007843 rescnt -= 2;
7844 width -= 2;
7845 if (width < 0)
7846 width = 0;
7847 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007848 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849 if (width > len && !(flags & F_LJUST)) {
7850 do {
7851 --rescnt;
7852 *res++ = fill;
7853 } while (--width > len);
7854 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007855 if (fill == ' ') {
7856 if (sign)
7857 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007858 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007859 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007860 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007861 *res++ = *pbuf++;
7862 *res++ = *pbuf++;
7863 }
7864 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007865 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866 res += len;
7867 rescnt -= len;
7868 while (--width >= len) {
7869 --rescnt;
7870 *res++ = ' ';
7871 }
7872 if (dict && (argidx < arglen) && c != '%') {
7873 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007874 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007875 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007876 goto onError;
7877 }
7878 Py_XDECREF(temp);
7879 } /* '%' */
7880 } /* until end */
7881 if (argidx < arglen && !dict) {
7882 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007883 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884 goto onError;
7885 }
7886
Thomas Woutersa96affe2006-03-12 00:29:36 +00007887 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7888 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007889 if (args_owned) {
7890 Py_DECREF(args);
7891 }
7892 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007893 return (PyObject *)result;
7894
7895 onError:
7896 Py_XDECREF(result);
7897 Py_DECREF(uformat);
7898 if (args_owned) {
7899 Py_DECREF(args);
7900 }
7901 return NULL;
7902}
7903
7904static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007905 (readbufferproc) unicode_buffer_getreadbuf,
7906 (writebufferproc) unicode_buffer_getwritebuf,
7907 (segcountproc) unicode_buffer_getsegcount,
7908 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007909};
7910
Jeremy Hylton938ace62002-07-17 16:30:39 +00007911static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007912unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7913
Tim Peters6d6c1a32001-08-02 04:15:00 +00007914static PyObject *
7915unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7916{
7917 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007918 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007919 char *encoding = NULL;
7920 char *errors = NULL;
7921
Guido van Rossume023fe02001-08-30 03:12:59 +00007922 if (type != &PyUnicode_Type)
7923 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007924 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7925 kwlist, &x, &encoding, &errors))
7926 return NULL;
7927 if (x == NULL)
7928 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007929 if (encoding == NULL && errors == NULL)
7930 return PyObject_Unicode(x);
7931 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007932 return PyUnicode_FromEncodedObject(x, encoding, errors);
7933}
7934
Guido van Rossume023fe02001-08-30 03:12:59 +00007935static PyObject *
7936unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7937{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007938 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007939 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007940
7941 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7942 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7943 if (tmp == NULL)
7944 return NULL;
7945 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007946 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007947 if (pnew == NULL) {
7948 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007949 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007950 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007951 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7952 if (pnew->str == NULL) {
7953 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007954 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007955 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007956 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007957 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007958 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7959 pnew->length = n;
7960 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007961 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007962 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007963}
7964
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007965PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007966"unicode(string [, encoding[, errors]]) -> object\n\
7967\n\
7968Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007969encoding defaults to the current default string encoding.\n\
7970errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007971
Guido van Rossumd57fd912000-03-10 22:53:23 +00007972PyTypeObject PyUnicode_Type = {
7973 PyObject_HEAD_INIT(&PyType_Type)
7974 0, /* ob_size */
7975 "unicode", /* tp_name */
7976 sizeof(PyUnicodeObject), /* tp_size */
7977 0, /* tp_itemsize */
7978 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007979 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007981 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007982 0, /* tp_setattr */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00007983 0, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00007984 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007985 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007986 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007987 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007988 (hashfunc) unicode_hash, /* tp_hash*/
7989 0, /* tp_call*/
7990 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007991 PyObject_GenericGetAttr, /* tp_getattro */
7992 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007993 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007994 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7995 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007996 unicode_doc, /* tp_doc */
7997 0, /* tp_traverse */
7998 0, /* tp_clear */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00007999 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008000 0, /* tp_weaklistoffset */
8001 0, /* tp_iter */
8002 0, /* tp_iternext */
8003 unicode_methods, /* tp_methods */
8004 0, /* tp_members */
8005 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008006 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008007 0, /* tp_dict */
8008 0, /* tp_descr_get */
8009 0, /* tp_descr_set */
8010 0, /* tp_dictoffset */
8011 0, /* tp_init */
8012 0, /* tp_alloc */
8013 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008014 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015};
8016
8017/* Initialize the Unicode implementation */
8018
Thomas Wouters78890102000-07-22 19:25:51 +00008019void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008021 int i;
8022
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008023 /* XXX - move this array to unicodectype.c ? */
8024 Py_UNICODE linebreak[] = {
8025 0x000A, /* LINE FEED */
8026 0x000D, /* CARRIAGE RETURN */
8027 0x001C, /* FILE SEPARATOR */
8028 0x001D, /* GROUP SEPARATOR */
8029 0x001E, /* RECORD SEPARATOR */
8030 0x0085, /* NEXT LINE */
8031 0x2028, /* LINE SEPARATOR */
8032 0x2029, /* PARAGRAPH SEPARATOR */
8033 };
8034
Fred Drakee4315f52000-05-09 19:53:39 +00008035 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008036 unicode_freelist = NULL;
8037 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008038 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008039 if (!unicode_empty)
8040 return;
8041
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008042 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008043 for (i = 0; i < 256; i++)
8044 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008045 if (PyType_Ready(&PyUnicode_Type) < 0)
8046 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008047
8048 /* initialize the linebreak bloom filter */
8049 bloom_linebreak = make_bloom_mask(
8050 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8051 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008052
8053 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008054}
8055
8056/* Finalize the Unicode implementation */
8057
8058void
Thomas Wouters78890102000-07-22 19:25:51 +00008059_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008060{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008061 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008062 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008063
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008064 Py_XDECREF(unicode_empty);
8065 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008066
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008067 for (i = 0; i < 256; i++) {
8068 if (unicode_latin1[i]) {
8069 Py_DECREF(unicode_latin1[i]);
8070 unicode_latin1[i] = NULL;
8071 }
8072 }
8073
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008074 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008075 PyUnicodeObject *v = u;
8076 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008077 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008078 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008079 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008080 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008081 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008082 unicode_freelist = NULL;
8083 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008084}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008085
Anthony Baxterac6bd462006-04-13 02:06:09 +00008086#ifdef __cplusplus
8087}
8088#endif
8089
8090
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008091/*
8092Local variables:
8093c-basic-offset: 4
8094indent-tabs-mode: nil
8095End:
8096*/