blob: 10a8385b9cdff5a6a63c471fa606a381fd55e940 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000116PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000117{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000118#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000119 return 0x10FFFF;
120#else
121 /* This is actually an illegal character, so it should
122 not be passed to unichr. */
123 return 0xFFFF;
124#endif
125}
126
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000127/* --- Bloom Filters ----------------------------------------------------- */
128
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask and take the low 5
   bits of each Unicode character as the bit index. */

/* The linebreak mask is set up by _PyUnicode_Init() below. */
134
/* Integer type holding one bloom bitmask (at least 32 bits wide). */
#define BLOOM_MASK unsigned long

/* Bloom mask of the characters that are line breaks. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of ch is set in mask,
   i.e. ch *may* be in the set the mask was built from.  The shifted
   constant is unsigned long so that bit 31 is well defined (shifting
   the int constant 1 by 31 overflows a 32-bit signed int), and mask is
   parenthesized so arbitrary expressions can be passed. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap bloom pre-test followed by the exact line-break test. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
143
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000144Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000145{
146 /* calculate simple bloom-style bitmask for a given unicode string */
147
148 long mask;
149 Py_ssize_t i;
150
151 mask = 0;
152 for (i = 0; i < len; i++)
153 mask |= (1 << (ptr[i] & 0x1F));
154
155 return mask;
156}
157
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000158Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000159{
160 Py_ssize_t i;
161
162 for (i = 0; i < setlen; i++)
163 if (set[i] == chr)
164 return 1;
165
Fredrik Lundh77633512006-05-23 19:47:35 +0000166 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000167}
168
/* Exact membership test of chr in set[0..setlen-1], short-circuited by
   the bloom mask built from that set.  The whole expansion is
   parenthesized so the macro stays a single operand when it is negated
   or combined with other operators at the call site. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
171
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172/* --- Unicode Object ----------------------------------------------------- */
173
174static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000176 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177{
178 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000179
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000180 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000181 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000182 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184 /* Resizing shared object (unicode_empty or single character
185 objects) in-place is not allowed. Use PyUnicode_Resize()
186 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000187
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000188 if (unicode == unicode_empty ||
189 (unicode->length == 1 &&
190 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000191 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000192 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 return -1;
195 }
196
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000197 /* We allocate one more byte to make sure the string is Ux0000 terminated.
198 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000199 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000200 it contains). */
201
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 oldstr = unicode->str;
203 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
204 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000205 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000206 PyErr_NoMemory();
207 return -1;
208 }
209 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000210 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000212 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000214 if (unicode->defenc) {
215 Py_DECREF(unicode->defenc);
216 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217 }
218 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000219
Guido van Rossumd57fd912000-03-10 22:53:23 +0000220 return 0;
221}
222
/* We allocate one more Py_UNICODE element (not just one byte) to make
   sure the string is Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225
226 XXX This allocator could further be enhanced by assuring that the
227 free list never reduces its size below 1.
228
229*/
230
231static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000232PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233{
234 register PyUnicodeObject *unicode;
235
Andrew Dalkee0df7622006-05-27 11:04:36 +0000236 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237 if (length == 0 && unicode_empty != NULL) {
238 Py_INCREF(unicode_empty);
239 return unicode_empty;
240 }
241
242 /* Unicode freelist & memory allocation */
243 if (unicode_freelist) {
244 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000245 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000248 /* Keep-Alive optimization: we only upsize the buffer,
249 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000250 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000251 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000252 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000253 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254 }
255 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000256 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000258 }
259 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000262 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 if (unicode == NULL)
264 return NULL;
265 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
266 }
267
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000268 if (!unicode->str) {
269 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000270 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000271 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000272 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000273 * the caller fails before initializing str -- unicode_resize()
274 * reads str[0], and the Keep-Alive optimization can keep memory
275 * allocated for str alive across a call to unicode_dealloc(unicode).
276 * We don't want unicode_resize to read uninitialized memory in
277 * that case.
278 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000279 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000283 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285
286 onError:
287 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000288 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000289 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290}
291
292static
Guido van Rossum9475a232001-10-05 20:51:39 +0000293void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000295 if (PyUnicode_CheckExact(unicode) &&
296 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000297 /* Keep-Alive optimization */
298 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000299 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 unicode->str = NULL;
301 unicode->length = 0;
302 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000303 if (unicode->defenc) {
304 Py_DECREF(unicode->defenc);
305 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000306 }
307 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 *(PyUnicodeObject **)unicode = unicode_freelist;
309 unicode_freelist = unicode;
310 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311 }
312 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000313 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000314 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000315 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316 }
317}
318
Martin v. Löwis18e16552006-02-15 17:27:45 +0000319int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000320{
321 register PyUnicodeObject *v;
322
323 /* Argument checks */
324 if (unicode == NULL) {
325 PyErr_BadInternalCall();
326 return -1;
327 }
328 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000329 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000330 PyErr_BadInternalCall();
331 return -1;
332 }
333
334 /* Resizing unicode_empty and single character objects is not
335 possible since these are being shared. We simply return a fresh
336 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000337 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000338 (v == unicode_empty || v->length == 1)) {
339 PyUnicodeObject *w = _PyUnicode_New(length);
340 if (w == NULL)
341 return -1;
342 Py_UNICODE_COPY(w->str, v->str,
343 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000344 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000345 *unicode = (PyObject *)w;
346 return 0;
347 }
348
349 /* Note that we don't have to modify *unicode for unshared Unicode
350 objects, since we can modify them in-place. */
351 return unicode_resize(v, length);
352}
353
354/* Internal API for use in unicodeobject.c only ! */
355#define _PyUnicode_Resize(unicodevar, length) \
356 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
357
Guido van Rossumd57fd912000-03-10 22:53:23 +0000358PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000359 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360{
361 PyUnicodeObject *unicode;
362
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000363 /* If the Unicode data is known at construction time, we can apply
364 some optimizations which share commonly used objects. */
365 if (u != NULL) {
366
367 /* Optimization for empty strings */
368 if (size == 0 && unicode_empty != NULL) {
369 Py_INCREF(unicode_empty);
370 return (PyObject *)unicode_empty;
371 }
372
373 /* Single character Unicode objects in the Latin-1 range are
374 shared when using this constructor */
375 if (size == 1 && *u < 256) {
376 unicode = unicode_latin1[*u];
377 if (!unicode) {
378 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000379 if (!unicode)
380 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000381 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000382 unicode_latin1[*u] = unicode;
383 }
384 Py_INCREF(unicode);
385 return (PyObject *)unicode;
386 }
387 }
Tim Petersced69f82003-09-16 20:30:58 +0000388
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 unicode = _PyUnicode_New(size);
390 if (!unicode)
391 return NULL;
392
393 /* Copy the Unicode data into the new object */
394 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000395 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000396
397 return (PyObject *)unicode;
398}
399
400#ifdef HAVE_WCHAR_H
401
402PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000403 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000404{
405 PyUnicodeObject *unicode;
406
407 if (w == NULL) {
408 PyErr_BadInternalCall();
409 return NULL;
410 }
411
412 unicode = _PyUnicode_New(size);
413 if (!unicode)
414 return NULL;
415
416 /* Copy the wchar_t data into the new object */
417#ifdef HAVE_USABLE_WCHAR_T
418 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000419#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000420 {
421 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000422 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000424 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425 *u++ = *w++;
426 }
427#endif
428
429 return (PyObject *)unicode;
430}
431
Martin v. Löwis18e16552006-02-15 17:27:45 +0000432Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
433 wchar_t *w,
434 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000435{
436 if (unicode == NULL) {
437 PyErr_BadInternalCall();
438 return -1;
439 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000440
441 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000442 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000443 size = PyUnicode_GET_SIZE(unicode) + 1;
444
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445#ifdef HAVE_USABLE_WCHAR_T
446 memcpy(w, unicode->str, size * sizeof(wchar_t));
447#else
448 {
449 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000450 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000451 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000452 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000453 *w++ = *u++;
454 }
455#endif
456
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000457 if (size > PyUnicode_GET_SIZE(unicode))
458 return PyUnicode_GET_SIZE(unicode);
459 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000460 return size;
461}
462
463#endif
464
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000465PyObject *PyUnicode_FromOrdinal(int ordinal)
466{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000467 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000468
469#ifdef Py_UNICODE_WIDE
470 if (ordinal < 0 || ordinal > 0x10ffff) {
471 PyErr_SetString(PyExc_ValueError,
472 "unichr() arg not in range(0x110000) "
473 "(wide Python build)");
474 return NULL;
475 }
476#else
477 if (ordinal < 0 || ordinal > 0xffff) {
478 PyErr_SetString(PyExc_ValueError,
479 "unichr() arg not in range(0x10000) "
480 "(narrow Python build)");
481 return NULL;
482 }
483#endif
484
Hye-Shik Chang40574832004-04-06 07:24:51 +0000485 s[0] = (Py_UNICODE)ordinal;
486 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000487}
488
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489PyObject *PyUnicode_FromObject(register PyObject *obj)
490{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491 /* XXX Perhaps we should make this API an alias of
492 PyObject_Unicode() instead ?! */
493 if (PyUnicode_CheckExact(obj)) {
494 Py_INCREF(obj);
495 return obj;
496 }
497 if (PyUnicode_Check(obj)) {
498 /* For a Unicode subtype that's not a Unicode object,
499 return a true Unicode object with the same data. */
500 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
501 PyUnicode_GET_SIZE(obj));
502 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000503 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
504}
505
506PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
507 const char *encoding,
508 const char *errors)
509{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000510 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000511 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000512 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000513
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 if (obj == NULL) {
515 PyErr_BadInternalCall();
516 return NULL;
517 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000518
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000519#if 0
520 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000521 that no encodings is given and then redirect to
522 PyObject_Unicode() which then applies the additional logic for
523 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000524
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000525 NOTE: This API should really only be used for object which
526 represent *encoded* Unicode !
527
528 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000529 if (PyUnicode_Check(obj)) {
530 if (encoding) {
531 PyErr_SetString(PyExc_TypeError,
532 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000533 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000534 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000535 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000536 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000537#else
538 if (PyUnicode_Check(obj)) {
539 PyErr_SetString(PyExc_TypeError,
540 "decoding Unicode is not supported");
541 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000542 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000543#endif
544
545 /* Coerce object */
546 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000547 s = PyString_AS_STRING(obj);
548 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000549 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000550 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
551 /* Overwrite the error message with something more useful in
552 case of a TypeError. */
553 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000554 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000555 "coercing to Unicode: need string or buffer, "
556 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000557 obj->ob_type->tp_name);
558 goto onError;
559 }
Tim Petersced69f82003-09-16 20:30:58 +0000560
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000561 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000562 if (len == 0) {
563 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000564 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000565 }
Tim Petersced69f82003-09-16 20:30:58 +0000566 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000567 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000568
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000569 return v;
570
571 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000572 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573}
574
575PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000576 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000577 const char *encoding,
578 const char *errors)
579{
580 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000581
582 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000583 encoding = PyUnicode_GetDefaultEncoding();
584
585 /* Shortcuts for common default encodings */
586 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000587 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000588 else if (strcmp(encoding, "latin-1") == 0)
589 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000590#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
591 else if (strcmp(encoding, "mbcs") == 0)
592 return PyUnicode_DecodeMBCS(s, size, errors);
593#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000594 else if (strcmp(encoding, "ascii") == 0)
595 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596
597 /* Decode via the codec registry */
598 buffer = PyBuffer_FromMemory((void *)s, size);
599 if (buffer == NULL)
600 goto onError;
601 unicode = PyCodec_Decode(buffer, encoding, errors);
602 if (unicode == NULL)
603 goto onError;
604 if (!PyUnicode_Check(unicode)) {
605 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000606 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000607 unicode->ob_type->tp_name);
608 Py_DECREF(unicode);
609 goto onError;
610 }
611 Py_DECREF(buffer);
612 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000613
Guido van Rossumd57fd912000-03-10 22:53:23 +0000614 onError:
615 Py_XDECREF(buffer);
616 return NULL;
617}
618
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000619PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
620 const char *encoding,
621 const char *errors)
622{
623 PyObject *v;
624
625 if (!PyUnicode_Check(unicode)) {
626 PyErr_BadArgument();
627 goto onError;
628 }
629
630 if (encoding == NULL)
631 encoding = PyUnicode_GetDefaultEncoding();
632
633 /* Decode via the codec registry */
634 v = PyCodec_Decode(unicode, encoding, errors);
635 if (v == NULL)
636 goto onError;
637 return v;
638
639 onError:
640 return NULL;
641}
642
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000644 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 const char *encoding,
646 const char *errors)
647{
648 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000649
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 unicode = PyUnicode_FromUnicode(s, size);
651 if (unicode == NULL)
652 return NULL;
653 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
654 Py_DECREF(unicode);
655 return v;
656}
657
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000658PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
659 const char *encoding,
660 const char *errors)
661{
662 PyObject *v;
663
664 if (!PyUnicode_Check(unicode)) {
665 PyErr_BadArgument();
666 goto onError;
667 }
668
669 if (encoding == NULL)
670 encoding = PyUnicode_GetDefaultEncoding();
671
672 /* Encode via the codec registry */
673 v = PyCodec_Encode(unicode, encoding, errors);
674 if (v == NULL)
675 goto onError;
676 return v;
677
678 onError:
679 return NULL;
680}
681
Guido van Rossumd57fd912000-03-10 22:53:23 +0000682PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
683 const char *encoding,
684 const char *errors)
685{
686 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000687
Guido van Rossumd57fd912000-03-10 22:53:23 +0000688 if (!PyUnicode_Check(unicode)) {
689 PyErr_BadArgument();
690 goto onError;
691 }
Fred Drakee4315f52000-05-09 19:53:39 +0000692
Tim Petersced69f82003-09-16 20:30:58 +0000693 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000694 encoding = PyUnicode_GetDefaultEncoding();
695
696 /* Shortcuts for common default encodings */
697 if (errors == NULL) {
698 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000699 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000700 else if (strcmp(encoding, "latin-1") == 0)
701 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000702#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
703 else if (strcmp(encoding, "mbcs") == 0)
704 return PyUnicode_AsMBCSString(unicode);
705#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000706 else if (strcmp(encoding, "ascii") == 0)
707 return PyUnicode_AsASCIIString(unicode);
708 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000709
710 /* Encode via the codec registry */
711 v = PyCodec_Encode(unicode, encoding, errors);
712 if (v == NULL)
713 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000714 if (!PyString_Check(v)) {
715 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000716 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717 v->ob_type->tp_name);
718 Py_DECREF(v);
719 goto onError;
720 }
721 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000722
Guido van Rossumd57fd912000-03-10 22:53:23 +0000723 onError:
724 return NULL;
725}
726
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000727PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
728 const char *errors)
729{
730 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
731
732 if (v)
733 return v;
734 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
735 if (v && errors == NULL)
736 ((PyUnicodeObject *)unicode)->defenc = v;
737 return v;
738}
739
Guido van Rossumd57fd912000-03-10 22:53:23 +0000740Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
741{
742 if (!PyUnicode_Check(unicode)) {
743 PyErr_BadArgument();
744 goto onError;
745 }
746 return PyUnicode_AS_UNICODE(unicode);
747
748 onError:
749 return NULL;
750}
751
Martin v. Löwis18e16552006-02-15 17:27:45 +0000752Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000753{
754 if (!PyUnicode_Check(unicode)) {
755 PyErr_BadArgument();
756 goto onError;
757 }
758 return PyUnicode_GET_SIZE(unicode);
759
760 onError:
761 return -1;
762}
763
Thomas Wouters78890102000-07-22 19:25:51 +0000764const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000765{
766 return unicode_default_encoding;
767}
768
769int PyUnicode_SetDefaultEncoding(const char *encoding)
770{
771 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000772
Fred Drakee4315f52000-05-09 19:53:39 +0000773 /* Make sure the encoding is valid. As side effect, this also
774 loads the encoding into the codec registry cache. */
775 v = _PyCodec_Lookup(encoding);
776 if (v == NULL)
777 goto onError;
778 Py_DECREF(v);
779 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000780 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000781 sizeof(unicode_default_encoding));
782 return 0;
783
784 onError:
785 return -1;
786}
787
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000788/* error handling callback helper:
789 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000790 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000791 and adjust various state variables.
792 return 0 on success, -1 on error
793*/
794
795static
796int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
797 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000798 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
799 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000800{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000801 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000802
803 PyObject *restuple = NULL;
804 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000805 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
806 Py_ssize_t requiredsize;
807 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000808 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000809 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000810 int res = -1;
811
812 if (*errorHandler == NULL) {
813 *errorHandler = PyCodec_LookupError(errors);
814 if (*errorHandler == NULL)
815 goto onError;
816 }
817
818 if (*exceptionObject == NULL) {
819 *exceptionObject = PyUnicodeDecodeError_Create(
820 encoding, input, insize, *startinpos, *endinpos, reason);
821 if (*exceptionObject == NULL)
822 goto onError;
823 }
824 else {
825 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
826 goto onError;
827 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
828 goto onError;
829 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
830 goto onError;
831 }
832
833 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
834 if (restuple == NULL)
835 goto onError;
836 if (!PyTuple_Check(restuple)) {
837 PyErr_Format(PyExc_TypeError, &argparse[4]);
838 goto onError;
839 }
840 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
841 goto onError;
842 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000843 newpos = insize+newpos;
844 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000845 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000846 goto onError;
847 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000848
849 /* need more space? (at least enough for what we
850 have+the replacement+the rest of the string (starting
851 at the new input position), so we won't have to check space
852 when there are no errors in the rest of the string) */
853 repptr = PyUnicode_AS_UNICODE(repunicode);
854 repsize = PyUnicode_GET_SIZE(repunicode);
855 requiredsize = *outpos + repsize + insize-newpos;
856 if (requiredsize > outsize) {
857 if (requiredsize<2*outsize)
858 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000859 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000860 goto onError;
861 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
862 }
863 *endinpos = newpos;
864 *inptr = input + newpos;
865 Py_UNICODE_COPY(*outptr, repptr, repsize);
866 *outptr += repsize;
867 *outpos += repsize;
868 /* we made it! */
869 res = 0;
870
871 onError:
872 Py_XDECREF(restuple);
873 return res;
874}
875
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000876/* --- UTF-7 Codec -------------------------------------------------------- */
877
878/* see RFC2152 for details */
879
/* Classification of the 128 ASCII code points for UTF-7, indexed by
   character value.  The table is only ever read, so it is declared const. */
static
const char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
	   0 - not special
	   1 - special
	   2 - whitespace (optional)
	   3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
898
/* SPECIAL(c, encodeO, encodeWS) is true when character c may not appear
   literally in UTF-7 text: c is outside 1..127, or utf7_special[c] is 1,
   or it is 2 (whitespace) and encodeWS is set, or it is 3 (Set O) and
   encodeO is set.  c is evaluated several times, so it must be free of
   side effects.  The flag arguments are parenthesized so that expression
   arguments (e.g. `a || b`) keep their meaning.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))
908
/* Base64 helpers for the UTF-7 codec.

   B64(n)     - base64 digit for the low 6 bits of n.
   B64CHAR(c) - true if c is one of the 64 base64 digits.  Written as
                explicit range checks instead of isalnum(): c may be a
                Py_UNICODE value far outside the unsigned-char range (for
                which isalnum() is undefined), and the result must not
                depend on the current locale.
   UB64(c)    - value 0..63 of base64 digit c; only meaningful when
                B64CHAR(c) is true.

   Like the offsets in UB64, the range checks assume an ASCII-compatible
   execution character set.  All three evaluate their argument more than
   once. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || (c) == '+' || (c) == '/')
#define UB64(c)        \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000916
/* ENCODE(out, ch, bits): while at least 6 bits are pending in `ch`, write
   the base64 digit for the top 6 pending bits through `out` (advancing it)
   and reduce `bits` by 6.  `out` and `bits` must be modifiable lvalues.
   Wrapped in do { } while (0) so the macro is a single statement and can
   be used safely inside if/else. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits) - 6)); \
            (bits) -= 6; \
        } \
    } while (0)
922
/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending in
   `ch`, take the top 16 pending bits as one code unit and reduce `bits`
   by 16.  The unit is written through `out` unless it is rejected or
   skipped (see below).

   The expansion assigns to a variable named `errmsg` and jumps to a label
   named `utf7Error`; both must exist in the function using the macro.

   NOTE(review): the comments speak of a *high* surrogate, but the range
   tested (0xDC00..0xDFFF) is the low-surrogate range -- confirm the
   intent before relying on this. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE unit = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (!surrogate) { \
            if (unit >= 0xDC00 && unit <= 0xDFFF) { \
                /* This is a surrogate pair. Unfortunately we can't \
                   represent it in a 16-bit character */ \
                surrogate = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } \
            *out++ = unit; \
        } else { \
            /* An error was already reported for the previous unit, so \
               this one is dropped without further checks */ \
            surrogate = 0; \
        } \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000941
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000942PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000943 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000944 const char *errors)
945{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000946 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000947 Py_ssize_t startinpos;
948 Py_ssize_t endinpos;
949 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000950 const char *e;
951 PyUnicodeObject *unicode;
952 Py_UNICODE *p;
953 const char *errmsg = "";
954 int inShift = 0;
955 unsigned int bitsleft = 0;
956 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000957 int surrogate = 0;
958 PyObject *errorHandler = NULL;
959 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000960
961 unicode = _PyUnicode_New(size);
962 if (!unicode)
963 return NULL;
964 if (size == 0)
965 return (PyObject *)unicode;
966
967 p = unicode->str;
968 e = s + size;
969
970 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000971 Py_UNICODE ch;
972 restart:
973 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000974
975 if (inShift) {
976 if ((ch == '-') || !B64CHAR(ch)) {
977 inShift = 0;
978 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000979
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000980 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
981 if (bitsleft >= 6) {
982 /* The shift sequence has a partial character in it. If
983 bitsleft < 6 then we could just classify it as padding
984 but that is not the case here */
985
986 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000987 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000988 }
989 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000990 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000991 here so indicate the potential of a misencoded character. */
992
993 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
994 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
995 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000996 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000997 }
998
999 if (ch == '-') {
1000 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001001 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001002 inShift = 1;
1003 }
1004 } else if (SPECIAL(ch,0,0)) {
1005 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001006 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001007 } else {
1008 *p++ = ch;
1009 }
1010 } else {
1011 charsleft = (charsleft << 6) | UB64(ch);
1012 bitsleft += 6;
1013 s++;
1014 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1015 }
1016 }
1017 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001018 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001019 s++;
1020 if (s < e && *s == '-') {
1021 s++;
1022 *p++ = '+';
1023 } else
1024 {
1025 inShift = 1;
1026 bitsleft = 0;
1027 }
1028 }
1029 else if (SPECIAL(ch,0,0)) {
1030 errmsg = "unexpected special character";
1031 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001032 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001033 }
1034 else {
1035 *p++ = ch;
1036 s++;
1037 }
1038 continue;
1039 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001040 outpos = p-PyUnicode_AS_UNICODE(unicode);
1041 endinpos = s-starts;
1042 if (unicode_decode_call_errorhandler(
1043 errors, &errorHandler,
1044 "utf7", errmsg,
1045 starts, size, &startinpos, &endinpos, &exc, &s,
1046 (PyObject **)&unicode, &outpos, &p))
1047 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001048 }
1049
1050 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001051 outpos = p-PyUnicode_AS_UNICODE(unicode);
1052 endinpos = size;
1053 if (unicode_decode_call_errorhandler(
1054 errors, &errorHandler,
1055 "utf7", "unterminated shift sequence",
1056 starts, size, &startinpos, &endinpos, &exc, &s,
1057 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001058 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001059 if (s < e)
1060 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001061 }
1062
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001063 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001064 goto onError;
1065
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001066 Py_XDECREF(errorHandler);
1067 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001068 return (PyObject *)unicode;
1069
1070onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001071 Py_XDECREF(errorHandler);
1072 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001073 Py_DECREF(unicode);
1074 return NULL;
1075}
1076
1077
1078PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001079 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001080 int encodeSetO,
1081 int encodeWhiteSpace,
1082 const char *errors)
1083{
1084 PyObject *v;
1085 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001086 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001087 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001088 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001089 unsigned int bitsleft = 0;
1090 unsigned long charsleft = 0;
1091 char * out;
1092 char * start;
1093
1094 if (size == 0)
1095 return PyString_FromStringAndSize(NULL, 0);
1096
1097 v = PyString_FromStringAndSize(NULL, cbAllocated);
1098 if (v == NULL)
1099 return NULL;
1100
1101 start = out = PyString_AS_STRING(v);
1102 for (;i < size; ++i) {
1103 Py_UNICODE ch = s[i];
1104
1105 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001106 if (ch == '+') {
1107 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001108 *out++ = '-';
1109 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1110 charsleft = ch;
1111 bitsleft = 16;
1112 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001113 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001114 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001115 } else {
1116 *out++ = (char) ch;
1117 }
1118 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001119 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1120 *out++ = B64(charsleft << (6-bitsleft));
1121 charsleft = 0;
1122 bitsleft = 0;
1123 /* Characters not in the BASE64 set implicitly unshift the sequence
1124 so no '-' is required, except if the character is itself a '-' */
1125 if (B64CHAR(ch) || ch == '-') {
1126 *out++ = '-';
1127 }
1128 inShift = 0;
1129 *out++ = (char) ch;
1130 } else {
1131 bitsleft += 16;
1132 charsleft = (charsleft << 16) | ch;
1133 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1134
1135 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001136 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001137 or '-' then the shift sequence will be terminated implicitly and we
1138 don't have to insert a '-'. */
1139
1140 if (bitsleft == 0) {
1141 if (i + 1 < size) {
1142 Py_UNICODE ch2 = s[i+1];
1143
1144 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001145
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001146 } else if (B64CHAR(ch2) || ch2 == '-') {
1147 *out++ = '-';
1148 inShift = 0;
1149 } else {
1150 inShift = 0;
1151 }
1152
1153 }
1154 else {
1155 *out++ = '-';
1156 inShift = 0;
1157 }
1158 }
Tim Petersced69f82003-09-16 20:30:58 +00001159 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001160 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001161 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001162 if (bitsleft) {
1163 *out++= B64(charsleft << (6-bitsleft) );
1164 *out++ = '-';
1165 }
1166
Tim Peters5de98422002-04-27 18:44:32 +00001167 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001168 return v;
1169}
1170
/* The UTF-7 helper macros are private to the codec above; drop them so
   they cannot leak into the rest of the file. */
#undef B64
#undef B64CHAR
#undef UB64
#undef SPECIAL
#undef ENCODE
#undef DECODE
1177
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178/* --- UTF-8 Codec -------------------------------------------------------- */
1179
/* Sequence length implied by each possible first byte of a UTF-8
   sequence.  The table is only ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1201
Guido van Rossumd57fd912000-03-10 22:53:23 +00001202PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001203 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001204 const char *errors)
1205{
Walter Dörwald69652032004-09-07 20:24:22 +00001206 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1207}
1208
1209PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001210 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001211 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001212 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001213{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001214 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001215 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001216 Py_ssize_t startinpos;
1217 Py_ssize_t endinpos;
1218 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219 const char *e;
1220 PyUnicodeObject *unicode;
1221 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001222 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001223 PyObject *errorHandler = NULL;
1224 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225
1226 /* Note: size will always be longer than the resulting Unicode
1227 character count */
1228 unicode = _PyUnicode_New(size);
1229 if (!unicode)
1230 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001231 if (size == 0) {
1232 if (consumed)
1233 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001234 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236
1237 /* Unpack UTF-8 encoded data */
1238 p = unicode->str;
1239 e = s + size;
1240
1241 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001242 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243
1244 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001245 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001246 s++;
1247 continue;
1248 }
1249
1250 n = utf8_code_length[ch];
1251
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001252 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001253 if (consumed)
1254 break;
1255 else {
1256 errmsg = "unexpected end of data";
1257 startinpos = s-starts;
1258 endinpos = size;
1259 goto utf8Error;
1260 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001261 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001262
1263 switch (n) {
1264
1265 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001266 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001267 startinpos = s-starts;
1268 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001269 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001270
1271 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001272 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001273 startinpos = s-starts;
1274 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001275 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276
1277 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001278 if ((s[1] & 0xc0) != 0x80) {
1279 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001280 startinpos = s-starts;
1281 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001282 goto utf8Error;
1283 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001284 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001285 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001286 startinpos = s-starts;
1287 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001288 errmsg = "illegal encoding";
1289 goto utf8Error;
1290 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001292 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 break;
1294
1295 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001296 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001297 (s[2] & 0xc0) != 0x80) {
1298 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001299 startinpos = s-starts;
1300 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001301 goto utf8Error;
1302 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001303 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001304 if (ch < 0x0800) {
1305 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001306 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001307
1308 XXX For wide builds (UCS-4) we should probably try
1309 to recombine the surrogates into a single code
1310 unit.
1311 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001312 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001313 startinpos = s-starts;
1314 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001315 goto utf8Error;
1316 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001317 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001318 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001319 break;
1320
1321 case 4:
1322 if ((s[1] & 0xc0) != 0x80 ||
1323 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001324 (s[3] & 0xc0) != 0x80) {
1325 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001326 startinpos = s-starts;
1327 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001328 goto utf8Error;
1329 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001330 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1331 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1332 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001333 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001334 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001335 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001336 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001337 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001338 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001339 startinpos = s-starts;
1340 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001341 goto utf8Error;
1342 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001343#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001344 *p++ = (Py_UNICODE)ch;
1345#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001346 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001347
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001348 /* translate from 10000..10FFFF to 0..FFFF */
1349 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001350
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001351 /* high surrogate = top 10 bits added to D800 */
1352 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001353
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001354 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001355 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001356#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001357 break;
1358
1359 default:
1360 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001361 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001362 startinpos = s-starts;
1363 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001364 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365 }
1366 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001367 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001368
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001369 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001370 outpos = p-PyUnicode_AS_UNICODE(unicode);
1371 if (unicode_decode_call_errorhandler(
1372 errors, &errorHandler,
1373 "utf8", errmsg,
1374 starts, size, &startinpos, &endinpos, &exc, &s,
1375 (PyObject **)&unicode, &outpos, &p))
1376 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001377 }
Walter Dörwald69652032004-09-07 20:24:22 +00001378 if (consumed)
1379 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001380
1381 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001382 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001383 goto onError;
1384
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001385 Py_XDECREF(errorHandler);
1386 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 return (PyObject *)unicode;
1388
1389onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001390 Py_XDECREF(errorHandler);
1391 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 Py_DECREF(unicode);
1393 return NULL;
1394}
1395
Tim Peters602f7402002-04-27 18:03:26 +00001396/* Allocation strategy: if the string is short, convert into a stack buffer
1397 and allocate exactly as much space needed at the end. Else allocate the
1398 maximum possible needed (4 result bytes per Unicode character), and return
1399 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001400*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001401PyObject *
1402PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001403 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001404 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001405{
Tim Peters602f7402002-04-27 18:03:26 +00001406#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001407
Martin v. Löwis18e16552006-02-15 17:27:45 +00001408 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001409 PyObject *v; /* result string object */
1410 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001411 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001412 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001413 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001414
Tim Peters602f7402002-04-27 18:03:26 +00001415 assert(s != NULL);
1416 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001417
Tim Peters602f7402002-04-27 18:03:26 +00001418 if (size <= MAX_SHORT_UNICHARS) {
1419 /* Write into the stack buffer; nallocated can't overflow.
1420 * At the end, we'll allocate exactly as much heap space as it
1421 * turns out we need.
1422 */
1423 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1424 v = NULL; /* will allocate after we're done */
1425 p = stackbuf;
1426 }
1427 else {
1428 /* Overallocate on the heap, and give the excess back at the end. */
1429 nallocated = size * 4;
1430 if (nallocated / 4 != size) /* overflow! */
1431 return PyErr_NoMemory();
1432 v = PyString_FromStringAndSize(NULL, nallocated);
1433 if (v == NULL)
1434 return NULL;
1435 p = PyString_AS_STRING(v);
1436 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001437
Tim Peters602f7402002-04-27 18:03:26 +00001438 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001439 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001440
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001441 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001442 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001443 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001444
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001446 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001447 *p++ = (char)(0xc0 | (ch >> 6));
1448 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001449 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001450 else {
Tim Peters602f7402002-04-27 18:03:26 +00001451 /* Encode UCS2 Unicode ordinals */
1452 if (ch < 0x10000) {
1453 /* Special case: check for high surrogate */
1454 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1455 Py_UCS4 ch2 = s[i];
1456 /* Check for low surrogate and combine the two to
1457 form a UCS4 value */
1458 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001459 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001460 i++;
1461 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001462 }
Tim Peters602f7402002-04-27 18:03:26 +00001463 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001464 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001465 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001466 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1467 *p++ = (char)(0x80 | (ch & 0x3f));
1468 continue;
1469 }
1470encodeUCS4:
1471 /* Encode UCS4 Unicode ordinals */
1472 *p++ = (char)(0xf0 | (ch >> 18));
1473 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1474 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1475 *p++ = (char)(0x80 | (ch & 0x3f));
1476 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001477 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001478
Tim Peters602f7402002-04-27 18:03:26 +00001479 if (v == NULL) {
1480 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001481 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001482 assert(nneeded <= nallocated);
1483 v = PyString_FromStringAndSize(stackbuf, nneeded);
1484 }
1485 else {
1486 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001487 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001488 assert(nneeded <= nallocated);
1489 _PyString_Resize(&v, nneeded);
1490 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001491 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001492
Tim Peters602f7402002-04-27 18:03:26 +00001493#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001494}
1495
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1497{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001498 if (!PyUnicode_Check(unicode)) {
1499 PyErr_BadArgument();
1500 return NULL;
1501 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001502 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1503 PyUnicode_GET_SIZE(unicode),
1504 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001505}
1506
1507/* --- UTF-16 Codec ------------------------------------------------------- */
1508
Tim Peters772747b2001-08-09 22:21:55 +00001509PyObject *
1510PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001511 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001512 const char *errors,
1513 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001514{
Walter Dörwald69652032004-09-07 20:24:22 +00001515 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1516}
1517
1518PyObject *
1519PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001520 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001521 const char *errors,
1522 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001523 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001524{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001525 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001526 Py_ssize_t startinpos;
1527 Py_ssize_t endinpos;
1528 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001529 PyUnicodeObject *unicode;
1530 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001531 const unsigned char *q, *e;
1532 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001533 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001534 /* Offsets from q for retrieving byte pairs in the right order. */
1535#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1536 int ihi = 1, ilo = 0;
1537#else
1538 int ihi = 0, ilo = 1;
1539#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001540 PyObject *errorHandler = NULL;
1541 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542
1543 /* Note: size will always be longer than the resulting Unicode
1544 character count */
1545 unicode = _PyUnicode_New(size);
1546 if (!unicode)
1547 return NULL;
1548 if (size == 0)
1549 return (PyObject *)unicode;
1550
1551 /* Unpack UTF-16 encoded data */
1552 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001553 q = (unsigned char *)s;
1554 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555
1556 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001557 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001558
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001559 /* Check for BOM marks (U+FEFF) in the input and adjust current
1560 byte order setting accordingly. In native mode, the leading BOM
1561 mark is skipped, in all other modes, it is copied to the output
1562 stream as-is (giving a ZWNBSP character). */
1563 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001564 if (size >= 2) {
1565 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001566#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001567 if (bom == 0xFEFF) {
1568 q += 2;
1569 bo = -1;
1570 }
1571 else if (bom == 0xFFFE) {
1572 q += 2;
1573 bo = 1;
1574 }
Tim Petersced69f82003-09-16 20:30:58 +00001575#else
Walter Dörwald69652032004-09-07 20:24:22 +00001576 if (bom == 0xFEFF) {
1577 q += 2;
1578 bo = 1;
1579 }
1580 else if (bom == 0xFFFE) {
1581 q += 2;
1582 bo = -1;
1583 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001584#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001585 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001587
Tim Peters772747b2001-08-09 22:21:55 +00001588 if (bo == -1) {
1589 /* force LE */
1590 ihi = 1;
1591 ilo = 0;
1592 }
1593 else if (bo == 1) {
1594 /* force BE */
1595 ihi = 0;
1596 ilo = 1;
1597 }
1598
1599 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001600 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001601 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001602 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001603 if (consumed)
1604 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001605 errmsg = "truncated data";
1606 startinpos = ((const char *)q)-starts;
1607 endinpos = ((const char *)e)-starts;
1608 goto utf16Error;
1609 /* The remaining input chars are ignored if the callback
1610 chooses to skip the input */
1611 }
1612 ch = (q[ihi] << 8) | q[ilo];
1613
Tim Peters772747b2001-08-09 22:21:55 +00001614 q += 2;
1615
Guido van Rossumd57fd912000-03-10 22:53:23 +00001616 if (ch < 0xD800 || ch > 0xDFFF) {
1617 *p++ = ch;
1618 continue;
1619 }
1620
1621 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001622 if (q >= e) {
1623 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001624 startinpos = (((const char *)q)-2)-starts;
1625 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001626 goto utf16Error;
1627 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001628 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001629 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1630 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001631 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001632#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001633 *p++ = ch;
1634 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001635#else
1636 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001637#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001638 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001639 }
1640 else {
1641 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001642 startinpos = (((const char *)q)-4)-starts;
1643 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001644 goto utf16Error;
1645 }
1646
Guido van Rossumd57fd912000-03-10 22:53:23 +00001647 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001648 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001649 startinpos = (((const char *)q)-2)-starts;
1650 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001651 /* Fall through to report the error */
1652
1653 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001654 outpos = p-PyUnicode_AS_UNICODE(unicode);
1655 if (unicode_decode_call_errorhandler(
1656 errors, &errorHandler,
1657 "utf16", errmsg,
1658 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1659 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001660 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001661 }
1662
1663 if (byteorder)
1664 *byteorder = bo;
1665
Walter Dörwald69652032004-09-07 20:24:22 +00001666 if (consumed)
1667 *consumed = (const char *)q-starts;
1668
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001670 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001671 goto onError;
1672
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001673 Py_XDECREF(errorHandler);
1674 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001675 return (PyObject *)unicode;
1676
1677onError:
1678 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001679 Py_XDECREF(errorHandler);
1680 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001681 return NULL;
1682}
1683
Tim Peters772747b2001-08-09 22:21:55 +00001684PyObject *
1685PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001686 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001687 const char *errors,
1688 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001689{
1690 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001691 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001692#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001693 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001694#else
1695 const int pairs = 0;
1696#endif
Tim Peters772747b2001-08-09 22:21:55 +00001697 /* Offsets from p for storing byte pairs in the right order. */
1698#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1699 int ihi = 1, ilo = 0;
1700#else
1701 int ihi = 0, ilo = 1;
1702#endif
1703
1704#define STORECHAR(CH) \
1705 do { \
1706 p[ihi] = ((CH) >> 8) & 0xff; \
1707 p[ilo] = (CH) & 0xff; \
1708 p += 2; \
1709 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001711#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001712 for (i = pairs = 0; i < size; i++)
1713 if (s[i] >= 0x10000)
1714 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001715#endif
Tim Petersced69f82003-09-16 20:30:58 +00001716 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001717 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718 if (v == NULL)
1719 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001720
Tim Peters772747b2001-08-09 22:21:55 +00001721 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001722 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001723 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001724 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001725 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001726
1727 if (byteorder == -1) {
1728 /* force LE */
1729 ihi = 1;
1730 ilo = 0;
1731 }
1732 else if (byteorder == 1) {
1733 /* force BE */
1734 ihi = 0;
1735 ilo = 1;
1736 }
1737
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001738 while (size-- > 0) {
1739 Py_UNICODE ch = *s++;
1740 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001741#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001742 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001743 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1744 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001746#endif
Tim Peters772747b2001-08-09 22:21:55 +00001747 STORECHAR(ch);
1748 if (ch2)
1749 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001750 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001752#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753}
1754
1755PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1756{
1757 if (!PyUnicode_Check(unicode)) {
1758 PyErr_BadArgument();
1759 return NULL;
1760 }
1761 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1762 PyUnicode_GET_SIZE(unicode),
1763 NULL,
1764 0);
1765}
1766
1767/* --- Unicode Escape Codec ----------------------------------------------- */
1768
Fredrik Lundh06d12682001-01-24 07:59:11 +00001769static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001770
Guido van Rossumd57fd912000-03-10 22:53:23 +00001771PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001772 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001773 const char *errors)
1774{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001775 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001776 Py_ssize_t startinpos;
1777 Py_ssize_t endinpos;
1778 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001779 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001781 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001783 char* message;
1784 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001785 PyObject *errorHandler = NULL;
1786 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001787
Guido van Rossumd57fd912000-03-10 22:53:23 +00001788 /* Escaped strings will always be longer than the resulting
1789 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001790 length after conversion to the true value.
1791 (but if the error callback returns a long replacement string
1792 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 v = _PyUnicode_New(size);
1794 if (v == NULL)
1795 goto onError;
1796 if (size == 0)
1797 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001798
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001799 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001800 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001801
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802 while (s < end) {
1803 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001804 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001805 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001806
1807 /* Non-escape characters are interpreted as Unicode ordinals */
1808 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001809 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810 continue;
1811 }
1812
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001813 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001814 /* \ - Escapes */
1815 s++;
1816 switch (*s++) {
1817
1818 /* \x escapes */
1819 case '\n': break;
1820 case '\\': *p++ = '\\'; break;
1821 case '\'': *p++ = '\''; break;
1822 case '\"': *p++ = '\"'; break;
1823 case 'b': *p++ = '\b'; break;
1824 case 'f': *p++ = '\014'; break; /* FF */
1825 case 't': *p++ = '\t'; break;
1826 case 'n': *p++ = '\n'; break;
1827 case 'r': *p++ = '\r'; break;
1828 case 'v': *p++ = '\013'; break; /* VT */
1829 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1830
1831 /* \OOO (octal) escapes */
1832 case '0': case '1': case '2': case '3':
1833 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001834 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001836 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001837 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001838 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001839 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001840 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001841 break;
1842
Fredrik Lundhccc74732001-02-18 22:13:49 +00001843 /* hex escapes */
1844 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001845 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001846 digits = 2;
1847 message = "truncated \\xXX escape";
1848 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849
Fredrik Lundhccc74732001-02-18 22:13:49 +00001850 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001851 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001852 digits = 4;
1853 message = "truncated \\uXXXX escape";
1854 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001855
Fredrik Lundhccc74732001-02-18 22:13:49 +00001856 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001857 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001858 digits = 8;
1859 message = "truncated \\UXXXXXXXX escape";
1860 hexescape:
1861 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001862 outpos = p-PyUnicode_AS_UNICODE(v);
1863 if (s+digits>end) {
1864 endinpos = size;
1865 if (unicode_decode_call_errorhandler(
1866 errors, &errorHandler,
1867 "unicodeescape", "end of string in escape sequence",
1868 starts, size, &startinpos, &endinpos, &exc, &s,
1869 (PyObject **)&v, &outpos, &p))
1870 goto onError;
1871 goto nextByte;
1872 }
1873 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001874 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001875 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001876 endinpos = (s+i+1)-starts;
1877 if (unicode_decode_call_errorhandler(
1878 errors, &errorHandler,
1879 "unicodeescape", message,
1880 starts, size, &startinpos, &endinpos, &exc, &s,
1881 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001882 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001883 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001884 }
1885 chr = (chr<<4) & ~0xF;
1886 if (c >= '0' && c <= '9')
1887 chr += c - '0';
1888 else if (c >= 'a' && c <= 'f')
1889 chr += 10 + c - 'a';
1890 else
1891 chr += 10 + c - 'A';
1892 }
1893 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001894 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001895 /* _decoding_error will have already written into the
1896 target buffer. */
1897 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001898 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001899 /* when we get here, chr is a 32-bit unicode character */
1900 if (chr <= 0xffff)
1901 /* UCS-2 character */
1902 *p++ = (Py_UNICODE) chr;
1903 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001904 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001905 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001906#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001907 *p++ = chr;
1908#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001909 chr -= 0x10000L;
1910 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001911 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001912#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001913 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001914 endinpos = s-starts;
1915 outpos = p-PyUnicode_AS_UNICODE(v);
1916 if (unicode_decode_call_errorhandler(
1917 errors, &errorHandler,
1918 "unicodeescape", "illegal Unicode character",
1919 starts, size, &startinpos, &endinpos, &exc, &s,
1920 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001921 goto onError;
1922 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001923 break;
1924
1925 /* \N{name} */
1926 case 'N':
1927 message = "malformed \\N character escape";
1928 if (ucnhash_CAPI == NULL) {
1929 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001930 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001931 m = PyImport_ImportModule("unicodedata");
1932 if (m == NULL)
1933 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001934 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001935 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001936 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001937 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00001938 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001939 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001940 if (ucnhash_CAPI == NULL)
1941 goto ucnhashError;
1942 }
1943 if (*s == '{') {
1944 const char *start = s+1;
1945 /* look for the closing brace */
1946 while (*s != '}' && s < end)
1947 s++;
1948 if (s > start && s < end && *s == '}') {
1949 /* found a name. look it up in the unicode database */
1950 message = "unknown Unicode character name";
1951 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001952 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001953 goto store;
1954 }
1955 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001956 endinpos = s-starts;
1957 outpos = p-PyUnicode_AS_UNICODE(v);
1958 if (unicode_decode_call_errorhandler(
1959 errors, &errorHandler,
1960 "unicodeescape", message,
1961 starts, size, &startinpos, &endinpos, &exc, &s,
1962 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001963 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001964 break;
1965
1966 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001967 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001968 message = "\\ at end of string";
1969 s--;
1970 endinpos = s-starts;
1971 outpos = p-PyUnicode_AS_UNICODE(v);
1972 if (unicode_decode_call_errorhandler(
1973 errors, &errorHandler,
1974 "unicodeescape", message,
1975 starts, size, &startinpos, &endinpos, &exc, &s,
1976 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001977 goto onError;
1978 }
1979 else {
1980 *p++ = '\\';
1981 *p++ = (unsigned char)s[-1];
1982 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001983 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001985 nextByte:
1986 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00001988 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001989 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001990 Py_XDECREF(errorHandler);
1991 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001993
Fredrik Lundhccc74732001-02-18 22:13:49 +00001994ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001995 PyErr_SetString(
1996 PyExc_UnicodeError,
1997 "\\N escapes not supported (can't load unicodedata module)"
1998 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001999 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002000 Py_XDECREF(errorHandler);
2001 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002002 return NULL;
2003
Fredrik Lundhccc74732001-02-18 22:13:49 +00002004onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002005 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002006 Py_XDECREF(errorHandler);
2007 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008 return NULL;
2009}
2010
/* unicodeescape_string() (defined after the findchar() helper below)
   returns a Unicode-Escape string version of a Py_UNICODE buffer.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
2017
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002018Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Fredrik Lundh95e2a912006-05-26 11:38:15 +00002019 Py_ssize_t size,
2020 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002021{
2022 /* like wcschr, but doesn't stop at NULL characters */
2023
2024 while (size-- > 0) {
2025 if (*s == ch)
2026 return s;
2027 s++;
2028 }
2029
2030 return NULL;
2031}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002032
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033static
2034PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002035 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036 int quotes)
2037{
2038 PyObject *repr;
2039 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002041 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042
Neal Norwitz19c35bb2006-08-21 22:13:11 +00002043 /* Initial allocation is based on the longest-possible unichr
2044 escape.
2045
2046 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2047 unichr, so in this case it's the longest unichr escape. In
2048 narrow (UTF-16) builds this is five chars per source unichr
2049 since there are two unichrs in the surrogate pair, so in narrow
2050 (UTF-16) builds it's not the longest unichr escape.
2051
2052 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2053 so in the narrow (UTF-16) build case it's the longest unichr
2054 escape.
2055 */
2056
2057 repr = PyString_FromStringAndSize(NULL,
2058 2
2059#ifdef Py_UNICODE_WIDE
2060 + 10*size
2061#else
2062 + 6*size
2063#endif
2064 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002065 if (repr == NULL)
2066 return NULL;
2067
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002068 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069
2070 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002071 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002072 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002073 !findchar(s, size, '"')) ? '"' : '\'';
2074 }
2075 while (size-- > 0) {
2076 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002077
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002078 /* Escape quotes and backslashes */
2079 if ((quotes &&
2080 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002081 *p++ = '\\';
2082 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002083 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002084 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002085
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002086#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002087 /* Map 21-bit characters to '\U00xxxxxx' */
2088 else if (ch >= 0x10000) {
2089 *p++ = '\\';
2090 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002091 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2092 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2093 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2094 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2095 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2096 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2097 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002098 *p++ = hexdigit[ch & 0x0000000F];
2099 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002100 }
Neal Norwitz19c35bb2006-08-21 22:13:11 +00002101#else
2102 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002103 else if (ch >= 0xD800 && ch < 0xDC00) {
2104 Py_UNICODE ch2;
2105 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002106
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002107 ch2 = *s++;
2108 size--;
2109 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2110 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2111 *p++ = '\\';
2112 *p++ = 'U';
2113 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2114 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2115 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2116 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2117 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2118 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2119 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2120 *p++ = hexdigit[ucs & 0x0000000F];
2121 continue;
2122 }
2123 /* Fall through: isolated surrogates are copied as-is */
2124 s--;
2125 size++;
2126 }
Neal Norwitz19c35bb2006-08-21 22:13:11 +00002127#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002128
Guido van Rossumd57fd912000-03-10 22:53:23 +00002129 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002130 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002131 *p++ = '\\';
2132 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002133 *p++ = hexdigit[(ch >> 12) & 0x000F];
2134 *p++ = hexdigit[(ch >> 8) & 0x000F];
2135 *p++ = hexdigit[(ch >> 4) & 0x000F];
2136 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002137 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002138
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002139 /* Map special whitespace to '\t', \n', '\r' */
2140 else if (ch == '\t') {
2141 *p++ = '\\';
2142 *p++ = 't';
2143 }
2144 else if (ch == '\n') {
2145 *p++ = '\\';
2146 *p++ = 'n';
2147 }
2148 else if (ch == '\r') {
2149 *p++ = '\\';
2150 *p++ = 'r';
2151 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002152
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002153 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002154 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002155 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002156 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002157 *p++ = hexdigit[(ch >> 4) & 0x000F];
2158 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002159 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002160
Guido van Rossumd57fd912000-03-10 22:53:23 +00002161 /* Copy everything else as-is */
2162 else
2163 *p++ = (char) ch;
2164 }
2165 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002166 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167
2168 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002169 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002170 return repr;
2171}
2172
2173PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002174 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002175{
2176 return unicodeescape_string(s, size, 0);
2177}
2178
2179PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2180{
2181 if (!PyUnicode_Check(unicode)) {
2182 PyErr_BadArgument();
2183 return NULL;
2184 }
2185 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2186 PyUnicode_GET_SIZE(unicode));
2187}
2188
2189/* --- Raw Unicode Escape Codec ------------------------------------------- */
2190
2191PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002192 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002193 const char *errors)
2194{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002195 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002196 Py_ssize_t startinpos;
2197 Py_ssize_t endinpos;
2198 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002199 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002200 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002201 const char *end;
2202 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002203 PyObject *errorHandler = NULL;
2204 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002205
Guido van Rossumd57fd912000-03-10 22:53:23 +00002206 /* Escaped strings will always be longer than the resulting
2207 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002208 length after conversion to the true value. (But decoding error
2209 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002210 v = _PyUnicode_New(size);
2211 if (v == NULL)
2212 goto onError;
2213 if (size == 0)
2214 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002215 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002216 end = s + size;
2217 while (s < end) {
2218 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002219 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002220 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002221 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222
2223 /* Non-escape characters are interpreted as Unicode ordinals */
2224 if (*s != '\\') {
2225 *p++ = (unsigned char)*s++;
2226 continue;
2227 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002228 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002229
2230 /* \u-escapes are only interpreted iff the number of leading
2231 backslashes if odd */
2232 bs = s;
2233 for (;s < end;) {
2234 if (*s != '\\')
2235 break;
2236 *p++ = (unsigned char)*s++;
2237 }
2238 if (((s - bs) & 1) == 0 ||
2239 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002240 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002241 continue;
2242 }
2243 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002244 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002245 s++;
2246
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002247 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002248 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002249 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002250 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002251 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002252 endinpos = s-starts;
2253 if (unicode_decode_call_errorhandler(
2254 errors, &errorHandler,
2255 "rawunicodeescape", "truncated \\uXXXX",
2256 starts, size, &startinpos, &endinpos, &exc, &s,
2257 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002259 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002260 }
2261 x = (x<<4) & ~0xF;
2262 if (c >= '0' && c <= '9')
2263 x += c - '0';
2264 else if (c >= 'a' && c <= 'f')
2265 x += 10 + c - 'a';
2266 else
2267 x += 10 + c - 'A';
2268 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002269#ifndef Py_UNICODE_WIDE
2270 if (x > 0x10000) {
2271 if (unicode_decode_call_errorhandler(
2272 errors, &errorHandler,
2273 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2274 starts, size, &startinpos, &endinpos, &exc, &s,
2275 (PyObject **)&v, &outpos, &p))
2276 goto onError;
2277 }
2278#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002279 *p++ = x;
2280 nextByte:
2281 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002282 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002283 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002284 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002285 Py_XDECREF(errorHandler);
2286 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002287 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002288
Guido van Rossumd57fd912000-03-10 22:53:23 +00002289 onError:
2290 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002291 Py_XDECREF(errorHandler);
2292 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002293 return NULL;
2294}
2295
2296PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002297 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298{
2299 PyObject *repr;
2300 char *p;
2301 char *q;
2302
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002303 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002304
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002305#ifdef Py_UNICODE_WIDE
2306 repr = PyString_FromStringAndSize(NULL, 10 * size);
2307#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002308 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002309#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002310 if (repr == NULL)
2311 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002312 if (size == 0)
2313 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002314
2315 p = q = PyString_AS_STRING(repr);
2316 while (size-- > 0) {
2317 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002318#ifdef Py_UNICODE_WIDE
2319 /* Map 32-bit characters to '\Uxxxxxxxx' */
2320 if (ch >= 0x10000) {
2321 *p++ = '\\';
2322 *p++ = 'U';
2323 *p++ = hexdigit[(ch >> 28) & 0xf];
2324 *p++ = hexdigit[(ch >> 24) & 0xf];
2325 *p++ = hexdigit[(ch >> 20) & 0xf];
2326 *p++ = hexdigit[(ch >> 16) & 0xf];
2327 *p++ = hexdigit[(ch >> 12) & 0xf];
2328 *p++ = hexdigit[(ch >> 8) & 0xf];
2329 *p++ = hexdigit[(ch >> 4) & 0xf];
2330 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002331 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002332 else
2333#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002334 /* Map 16-bit characters to '\uxxxx' */
2335 if (ch >= 256) {
2336 *p++ = '\\';
2337 *p++ = 'u';
2338 *p++ = hexdigit[(ch >> 12) & 0xf];
2339 *p++ = hexdigit[(ch >> 8) & 0xf];
2340 *p++ = hexdigit[(ch >> 4) & 0xf];
2341 *p++ = hexdigit[ch & 15];
2342 }
2343 /* Copy everything else as-is */
2344 else
2345 *p++ = (char) ch;
2346 }
2347 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002348 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002349 return repr;
2350}
2351
2352PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2353{
2354 if (!PyUnicode_Check(unicode)) {
2355 PyErr_BadArgument();
2356 return NULL;
2357 }
2358 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2359 PyUnicode_GET_SIZE(unicode));
2360}
2361
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002362/* --- Unicode Internal Codec ------------------------------------------- */
2363
2364PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002365 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002366 const char *errors)
2367{
2368 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002369 Py_ssize_t startinpos;
2370 Py_ssize_t endinpos;
2371 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002372 PyUnicodeObject *v;
2373 Py_UNICODE *p;
2374 const char *end;
2375 const char *reason;
2376 PyObject *errorHandler = NULL;
2377 PyObject *exc = NULL;
2378
Neal Norwitzd43069c2006-01-08 01:12:10 +00002379#ifdef Py_UNICODE_WIDE
2380 Py_UNICODE unimax = PyUnicode_GetMax();
2381#endif
2382
Armin Rigo4b63c212006-10-04 11:44:06 +00002383 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002384 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2385 if (v == NULL)
2386 goto onError;
2387 if (PyUnicode_GetSize((PyObject *)v) == 0)
2388 return (PyObject *)v;
2389 p = PyUnicode_AS_UNICODE(v);
2390 end = s + size;
2391
2392 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00002393 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002394 /* We have to sanity check the raw data, otherwise doom looms for
2395 some malformed UCS-4 data. */
2396 if (
2397 #ifdef Py_UNICODE_WIDE
2398 *p > unimax || *p < 0 ||
2399 #endif
2400 end-s < Py_UNICODE_SIZE
2401 )
2402 {
2403 startinpos = s - starts;
2404 if (end-s < Py_UNICODE_SIZE) {
2405 endinpos = end-starts;
2406 reason = "truncated input";
2407 }
2408 else {
2409 endinpos = s - starts + Py_UNICODE_SIZE;
2410 reason = "illegal code point (> 0x10FFFF)";
2411 }
2412 outpos = p - PyUnicode_AS_UNICODE(v);
2413 if (unicode_decode_call_errorhandler(
2414 errors, &errorHandler,
2415 "unicode_internal", reason,
2416 starts, size, &startinpos, &endinpos, &exc, &s,
2417 (PyObject **)&v, &outpos, &p)) {
2418 goto onError;
2419 }
2420 }
2421 else {
2422 p++;
2423 s += Py_UNICODE_SIZE;
2424 }
2425 }
2426
Martin v. Löwis412fb672006-04-13 06:34:32 +00002427 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002428 goto onError;
2429 Py_XDECREF(errorHandler);
2430 Py_XDECREF(exc);
2431 return (PyObject *)v;
2432
2433 onError:
2434 Py_XDECREF(v);
2435 Py_XDECREF(errorHandler);
2436 Py_XDECREF(exc);
2437 return NULL;
2438}
2439
Guido van Rossumd57fd912000-03-10 22:53:23 +00002440/* --- Latin-1 Codec ------------------------------------------------------ */
2441
2442PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002443 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002444 const char *errors)
2445{
2446 PyUnicodeObject *v;
2447 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002448
Guido van Rossumd57fd912000-03-10 22:53:23 +00002449 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002450 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002451 Py_UNICODE r = *(unsigned char*)s;
2452 return PyUnicode_FromUnicode(&r, 1);
2453 }
2454
Guido van Rossumd57fd912000-03-10 22:53:23 +00002455 v = _PyUnicode_New(size);
2456 if (v == NULL)
2457 goto onError;
2458 if (size == 0)
2459 return (PyObject *)v;
2460 p = PyUnicode_AS_UNICODE(v);
2461 while (size-- > 0)
2462 *p++ = (unsigned char)*s++;
2463 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002464
Guido van Rossumd57fd912000-03-10 22:53:23 +00002465 onError:
2466 Py_XDECREF(v);
2467 return NULL;
2468}
2469
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002470/* create or adjust a UnicodeEncodeError */
2471static void make_encode_exception(PyObject **exceptionObject,
2472 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002473 const Py_UNICODE *unicode, Py_ssize_t size,
2474 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002475 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002476{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002477 if (*exceptionObject == NULL) {
2478 *exceptionObject = PyUnicodeEncodeError_Create(
2479 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002480 }
2481 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002482 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2483 goto onError;
2484 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2485 goto onError;
2486 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2487 goto onError;
2488 return;
2489 onError:
2490 Py_DECREF(*exceptionObject);
2491 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002492 }
2493}
2494
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002495/* raises a UnicodeEncodeError */
2496static void raise_encode_exception(PyObject **exceptionObject,
2497 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002498 const Py_UNICODE *unicode, Py_ssize_t size,
2499 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002500 const char *reason)
2501{
2502 make_encode_exception(exceptionObject,
2503 encoding, unicode, size, startpos, endpos, reason);
2504 if (*exceptionObject != NULL)
2505 PyCodec_StrictErrors(*exceptionObject);
2506}
2507
2508/* error handling callback helper:
2509 build arguments, call the callback and check the arguments,
2510 put the result into newpos and return the replacement string, which
2511 has to be freed by the caller */
2512static PyObject *unicode_encode_call_errorhandler(const char *errors,
2513 PyObject **errorHandler,
2514 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002515 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2516 Py_ssize_t startpos, Py_ssize_t endpos,
2517 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002518{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002519 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002520
2521 PyObject *restuple;
2522 PyObject *resunicode;
2523
2524 if (*errorHandler == NULL) {
2525 *errorHandler = PyCodec_LookupError(errors);
2526 if (*errorHandler == NULL)
2527 return NULL;
2528 }
2529
2530 make_encode_exception(exceptionObject,
2531 encoding, unicode, size, startpos, endpos, reason);
2532 if (*exceptionObject == NULL)
2533 return NULL;
2534
2535 restuple = PyObject_CallFunctionObjArgs(
2536 *errorHandler, *exceptionObject, NULL);
2537 if (restuple == NULL)
2538 return NULL;
2539 if (!PyTuple_Check(restuple)) {
2540 PyErr_Format(PyExc_TypeError, &argparse[4]);
2541 Py_DECREF(restuple);
2542 return NULL;
2543 }
2544 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2545 &resunicode, newpos)) {
2546 Py_DECREF(restuple);
2547 return NULL;
2548 }
2549 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002550 *newpos = size+*newpos;
2551 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002552 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002553 Py_DECREF(restuple);
2554 return NULL;
2555 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002556 Py_INCREF(resunicode);
2557 Py_DECREF(restuple);
2558 return resunicode;
2559}
2560
2561static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002562 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002563 const char *errors,
2564 int limit)
2565{
2566 /* output object */
2567 PyObject *res;
2568 /* pointers to the beginning and end+1 of input */
2569 const Py_UNICODE *startp = p;
2570 const Py_UNICODE *endp = p + size;
2571 /* pointer to the beginning of the unencodable characters */
2572 /* const Py_UNICODE *badp = NULL; */
2573 /* pointer into the output */
2574 char *str;
2575 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002576 Py_ssize_t respos = 0;
2577 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00002578 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2579 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002580 PyObject *errorHandler = NULL;
2581 PyObject *exc = NULL;
2582 /* the following variable is used for caching string comparisons
2583 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2584 int known_errorHandler = -1;
2585
2586 /* allocate enough for a simple encoding without
2587 replacements, if we need more, we'll resize */
2588 res = PyString_FromStringAndSize(NULL, size);
2589 if (res == NULL)
2590 goto onError;
2591 if (size == 0)
2592 return res;
2593 str = PyString_AS_STRING(res);
2594 ressize = size;
2595
2596 while (p<endp) {
2597 Py_UNICODE c = *p;
2598
2599 /* can we encode this? */
2600 if (c<limit) {
2601 /* no overflow check, because we know that the space is enough */
2602 *str++ = (char)c;
2603 ++p;
2604 }
2605 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002606 Py_ssize_t unicodepos = p-startp;
2607 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002608 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002609 Py_ssize_t repsize;
2610 Py_ssize_t newpos;
2611 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002612 Py_UNICODE *uni2;
2613 /* startpos for collecting unencodable chars */
2614 const Py_UNICODE *collstart = p;
2615 const Py_UNICODE *collend = p;
2616 /* find all unecodable characters */
2617 while ((collend < endp) && ((*collend)>=limit))
2618 ++collend;
2619 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2620 if (known_errorHandler==-1) {
2621 if ((errors==NULL) || (!strcmp(errors, "strict")))
2622 known_errorHandler = 1;
2623 else if (!strcmp(errors, "replace"))
2624 known_errorHandler = 2;
2625 else if (!strcmp(errors, "ignore"))
2626 known_errorHandler = 3;
2627 else if (!strcmp(errors, "xmlcharrefreplace"))
2628 known_errorHandler = 4;
2629 else
2630 known_errorHandler = 0;
2631 }
2632 switch (known_errorHandler) {
2633 case 1: /* strict */
2634 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2635 goto onError;
2636 case 2: /* replace */
2637 while (collstart++<collend)
2638 *str++ = '?'; /* fall through */
2639 case 3: /* ignore */
2640 p = collend;
2641 break;
2642 case 4: /* xmlcharrefreplace */
2643 respos = str-PyString_AS_STRING(res);
2644 /* determine replacement size (temporarily (mis)uses p) */
2645 for (p = collstart, repsize = 0; p < collend; ++p) {
2646 if (*p<10)
2647 repsize += 2+1+1;
2648 else if (*p<100)
2649 repsize += 2+2+1;
2650 else if (*p<1000)
2651 repsize += 2+3+1;
2652 else if (*p<10000)
2653 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002654#ifndef Py_UNICODE_WIDE
2655 else
2656 repsize += 2+5+1;
2657#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002658 else if (*p<100000)
2659 repsize += 2+5+1;
2660 else if (*p<1000000)
2661 repsize += 2+6+1;
2662 else
2663 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002664#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002665 }
2666 requiredsize = respos+repsize+(endp-collend);
2667 if (requiredsize > ressize) {
2668 if (requiredsize<2*ressize)
2669 requiredsize = 2*ressize;
2670 if (_PyString_Resize(&res, requiredsize))
2671 goto onError;
2672 str = PyString_AS_STRING(res) + respos;
2673 ressize = requiredsize;
2674 }
2675 /* generate replacement (temporarily (mis)uses p) */
2676 for (p = collstart; p < collend; ++p) {
2677 str += sprintf(str, "&#%d;", (int)*p);
2678 }
2679 p = collend;
2680 break;
2681 default:
2682 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2683 encoding, reason, startp, size, &exc,
2684 collstart-startp, collend-startp, &newpos);
2685 if (repunicode == NULL)
2686 goto onError;
2687 /* need more space? (at least enough for what we
2688 have+the replacement+the rest of the string, so
2689 we won't have to check space for encodable characters) */
2690 respos = str-PyString_AS_STRING(res);
2691 repsize = PyUnicode_GET_SIZE(repunicode);
2692 requiredsize = respos+repsize+(endp-collend);
2693 if (requiredsize > ressize) {
2694 if (requiredsize<2*ressize)
2695 requiredsize = 2*ressize;
2696 if (_PyString_Resize(&res, requiredsize)) {
2697 Py_DECREF(repunicode);
2698 goto onError;
2699 }
2700 str = PyString_AS_STRING(res) + respos;
2701 ressize = requiredsize;
2702 }
2703 /* check if there is anything unencodable in the replacement
2704 and copy it to the output */
2705 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2706 c = *uni2;
2707 if (c >= limit) {
2708 raise_encode_exception(&exc, encoding, startp, size,
2709 unicodepos, unicodepos+1, reason);
2710 Py_DECREF(repunicode);
2711 goto onError;
2712 }
2713 *str = (char)c;
2714 }
2715 p = startp + newpos;
2716 Py_DECREF(repunicode);
2717 }
2718 }
2719 }
2720 /* Resize if we allocated to much */
2721 respos = str-PyString_AS_STRING(res);
2722 if (respos<ressize)
2723 /* If this falls res will be NULL */
2724 _PyString_Resize(&res, respos);
2725 Py_XDECREF(errorHandler);
2726 Py_XDECREF(exc);
2727 return res;
2728
2729 onError:
2730 Py_XDECREF(res);
2731 Py_XDECREF(errorHandler);
2732 Py_XDECREF(exc);
2733 return NULL;
2734}
2735
Guido van Rossumd57fd912000-03-10 22:53:23 +00002736PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002737 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738 const char *errors)
2739{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002740 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741}
2742
2743PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2744{
2745 if (!PyUnicode_Check(unicode)) {
2746 PyErr_BadArgument();
2747 return NULL;
2748 }
2749 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2750 PyUnicode_GET_SIZE(unicode),
2751 NULL);
2752}
2753
2754/* --- 7-bit ASCII Codec -------------------------------------------------- */
2755
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002757 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758 const char *errors)
2759{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002760 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761 PyUnicodeObject *v;
2762 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002763 Py_ssize_t startinpos;
2764 Py_ssize_t endinpos;
2765 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002766 const char *e;
2767 PyObject *errorHandler = NULL;
2768 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002769
Guido van Rossumd57fd912000-03-10 22:53:23 +00002770 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002771 if (size == 1 && *(unsigned char*)s < 128) {
2772 Py_UNICODE r = *(unsigned char*)s;
2773 return PyUnicode_FromUnicode(&r, 1);
2774 }
Tim Petersced69f82003-09-16 20:30:58 +00002775
Guido van Rossumd57fd912000-03-10 22:53:23 +00002776 v = _PyUnicode_New(size);
2777 if (v == NULL)
2778 goto onError;
2779 if (size == 0)
2780 return (PyObject *)v;
2781 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002782 e = s + size;
2783 while (s < e) {
2784 register unsigned char c = (unsigned char)*s;
2785 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002787 ++s;
2788 }
2789 else {
2790 startinpos = s-starts;
2791 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002792 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002793 if (unicode_decode_call_errorhandler(
2794 errors, &errorHandler,
2795 "ascii", "ordinal not in range(128)",
2796 starts, size, &startinpos, &endinpos, &exc, &s,
2797 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002799 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002800 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002801 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00002802 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002803 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002804 Py_XDECREF(errorHandler);
2805 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002807
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808 onError:
2809 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002810 Py_XDECREF(errorHandler);
2811 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002812 return NULL;
2813}
2814
Guido van Rossumd57fd912000-03-10 22:53:23 +00002815PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002816 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002817 const char *errors)
2818{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002819 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002820}
2821
2822PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2823{
2824 if (!PyUnicode_Check(unicode)) {
2825 PyErr_BadArgument();
2826 return NULL;
2827 }
2828 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2829 PyUnicode_GET_SIZE(unicode),
2830 NULL);
2831}
2832
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002833#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002834
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002835/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002836
Martin v. Löwisd8251432006-06-14 05:21:04 +00002837#if SIZEOF_INT < SIZEOF_SSIZE_T
2838#define NEED_RETRY
2839#endif
2840
2841/* XXX This code is limited to "true" double-byte encodings, as
2842 a) it assumes an incomplete character consists of a single byte, and
2843 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2844 encodings, see IsDBCSLeadByteEx documentation. */
2845
/* Return 1 if the byte at s[offset] is to be treated as a DBCS lead
   byte, 0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  It then counts as a lead byte
   when CharPrev() cannot step back from it, when the byte CharPrev()
   lands on is not itself a lead byte, or when CharPrev() stepped back
   exactly two bytes. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
2856
2857/*
2858 * Decode MBCS string into unicode object. If 'final' is set, converts
2859 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
2860 */
2861static int decode_mbcs(PyUnicodeObject **v,
2862 const char *s, /* MBCS string */
2863 int size, /* sizeof MBCS string */
2864 int final)
2865{
2866 Py_UNICODE *p;
2867 Py_ssize_t n = 0;
2868 int usize = 0;
2869
2870 assert(size >= 0);
2871
2872 /* Skip trailing lead-byte unless 'final' is set */
2873 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
2874 --size;
2875
2876 /* First get the size of the result */
2877 if (size > 0) {
2878 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2879 if (usize == 0) {
2880 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2881 return -1;
2882 }
2883 }
2884
2885 if (*v == NULL) {
2886 /* Create unicode object */
2887 *v = _PyUnicode_New(usize);
2888 if (*v == NULL)
2889 return -1;
2890 }
2891 else {
2892 /* Extend unicode object */
2893 n = PyUnicode_GET_SIZE(*v);
2894 if (_PyUnicode_Resize(v, n + usize) < 0)
2895 return -1;
2896 }
2897
2898 /* Do the conversion */
2899 if (size > 0) {
2900 p = PyUnicode_AS_UNICODE(*v) + n;
2901 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2902 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2903 return -1;
2904 }
2905 }
2906
2907 return size;
2908}
2909
2910PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
2911 Py_ssize_t size,
2912 const char *errors,
2913 Py_ssize_t *consumed)
2914{
2915 PyUnicodeObject *v = NULL;
2916 int done;
2917
2918 if (consumed)
2919 *consumed = 0;
2920
2921#ifdef NEED_RETRY
2922 retry:
2923 if (size > INT_MAX)
2924 done = decode_mbcs(&v, s, INT_MAX, 0);
2925 else
2926#endif
2927 done = decode_mbcs(&v, s, (int)size, !consumed);
2928
2929 if (done < 0) {
2930 Py_XDECREF(v);
2931 return NULL;
2932 }
2933
2934 if (consumed)
2935 *consumed += done;
2936
2937#ifdef NEED_RETRY
2938 if (size > INT_MAX) {
2939 s += done;
2940 size -= done;
2941 goto retry;
2942 }
2943#endif
2944
2945 return (PyObject *)v;
2946}
2947
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002948PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002949 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002950 const char *errors)
2951{
Martin v. Löwisd8251432006-06-14 05:21:04 +00002952 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
2953}
2954
2955/*
2956 * Convert unicode into string object (MBCS).
2957 * Returns 0 if succeed, -1 otherwise.
2958 */
2959static int encode_mbcs(PyObject **repr,
2960 const Py_UNICODE *p, /* unicode */
2961 int size) /* size of unicode */
2962{
2963 int mbcssize = 0;
2964 Py_ssize_t n = 0;
2965
2966 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002967
2968 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00002969 if (size > 0) {
2970 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2971 if (mbcssize == 0) {
2972 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2973 return -1;
2974 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002975 }
2976
Martin v. Löwisd8251432006-06-14 05:21:04 +00002977 if (*repr == NULL) {
2978 /* Create string object */
2979 *repr = PyString_FromStringAndSize(NULL, mbcssize);
2980 if (*repr == NULL)
2981 return -1;
2982 }
2983 else {
2984 /* Extend string object */
2985 n = PyString_Size(*repr);
2986 if (_PyString_Resize(repr, n + mbcssize) < 0)
2987 return -1;
2988 }
2989
2990 /* Do the conversion */
2991 if (size > 0) {
2992 char *s = PyString_AS_STRING(*repr) + n;
2993 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2994 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2995 return -1;
2996 }
2997 }
2998
2999 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003000}
3001
3002PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003003 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003004 const char *errors)
3005{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003006 PyObject *repr = NULL;
3007 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003008
Martin v. Löwisd8251432006-06-14 05:21:04 +00003009#ifdef NEED_RETRY
3010 retry:
3011 if (size > INT_MAX)
3012 ret = encode_mbcs(&repr, p, INT_MAX);
3013 else
3014#endif
3015 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003016
Martin v. Löwisd8251432006-06-14 05:21:04 +00003017 if (ret < 0) {
3018 Py_XDECREF(repr);
3019 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003020 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003021
3022#ifdef NEED_RETRY
3023 if (size > INT_MAX) {
3024 p += INT_MAX;
3025 size -= INT_MAX;
3026 goto retry;
3027 }
3028#endif
3029
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003030 return repr;
3031}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003032
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003033PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3034{
3035 if (!PyUnicode_Check(unicode)) {
3036 PyErr_BadArgument();
3037 return NULL;
3038 }
3039 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3040 PyUnicode_GET_SIZE(unicode),
3041 NULL);
3042}
3043
Martin v. Löwisd8251432006-06-14 05:21:04 +00003044#undef NEED_RETRY
3045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003046#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003047
Guido van Rossumd57fd912000-03-10 22:53:23 +00003048/* --- Character Mapping Codec -------------------------------------------- */
3049
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003051 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052 PyObject *mapping,
3053 const char *errors)
3054{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003055 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003056 Py_ssize_t startinpos;
3057 Py_ssize_t endinpos;
3058 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003059 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003060 PyUnicodeObject *v;
3061 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003062 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003063 PyObject *errorHandler = NULL;
3064 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003065 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003066 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003067
Guido van Rossumd57fd912000-03-10 22:53:23 +00003068 /* Default to Latin-1 */
3069 if (mapping == NULL)
3070 return PyUnicode_DecodeLatin1(s, size, errors);
3071
3072 v = _PyUnicode_New(size);
3073 if (v == NULL)
3074 goto onError;
3075 if (size == 0)
3076 return (PyObject *)v;
3077 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003078 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003079 if (PyUnicode_CheckExact(mapping)) {
3080 mapstring = PyUnicode_AS_UNICODE(mapping);
3081 maplen = PyUnicode_GET_SIZE(mapping);
3082 while (s < e) {
3083 unsigned char ch = *s;
3084 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003086 if (ch < maplen)
3087 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003088
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003089 if (x == 0xfffe) {
3090 /* undefined mapping */
3091 outpos = p-PyUnicode_AS_UNICODE(v);
3092 startinpos = s-starts;
3093 endinpos = startinpos+1;
3094 if (unicode_decode_call_errorhandler(
3095 errors, &errorHandler,
3096 "charmap", "character maps to <undefined>",
3097 starts, size, &startinpos, &endinpos, &exc, &s,
3098 (PyObject **)&v, &outpos, &p)) {
3099 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003100 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003101 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003102 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003103 *p++ = x;
3104 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003105 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003106 }
3107 else {
3108 while (s < e) {
3109 unsigned char ch = *s;
3110 PyObject *w, *x;
3111
3112 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3113 w = PyInt_FromLong((long)ch);
3114 if (w == NULL)
3115 goto onError;
3116 x = PyObject_GetItem(mapping, w);
3117 Py_DECREF(w);
3118 if (x == NULL) {
3119 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3120 /* No mapping found means: mapping is undefined. */
3121 PyErr_Clear();
3122 x = Py_None;
3123 Py_INCREF(x);
3124 } else
3125 goto onError;
3126 }
3127
3128 /* Apply mapping */
3129 if (PyInt_Check(x)) {
3130 long value = PyInt_AS_LONG(x);
3131 if (value < 0 || value > 65535) {
3132 PyErr_SetString(PyExc_TypeError,
3133 "character mapping must be in range(65536)");
3134 Py_DECREF(x);
3135 goto onError;
3136 }
3137 *p++ = (Py_UNICODE)value;
3138 }
3139 else if (x == Py_None) {
3140 /* undefined mapping */
3141 outpos = p-PyUnicode_AS_UNICODE(v);
3142 startinpos = s-starts;
3143 endinpos = startinpos+1;
3144 if (unicode_decode_call_errorhandler(
3145 errors, &errorHandler,
3146 "charmap", "character maps to <undefined>",
3147 starts, size, &startinpos, &endinpos, &exc, &s,
3148 (PyObject **)&v, &outpos, &p)) {
3149 Py_DECREF(x);
3150 goto onError;
3151 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003152 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003153 continue;
3154 }
3155 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003156 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003157
3158 if (targetsize == 1)
3159 /* 1-1 mapping */
3160 *p++ = *PyUnicode_AS_UNICODE(x);
3161
3162 else if (targetsize > 1) {
3163 /* 1-n mapping */
3164 if (targetsize > extrachars) {
3165 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003166 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3167 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003168 (targetsize << 2);
3169 extrachars += needed;
Armin Rigo4b63c212006-10-04 11:44:06 +00003170 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003171 if (_PyUnicode_Resize(&v,
3172 PyUnicode_GET_SIZE(v) + needed) < 0) {
3173 Py_DECREF(x);
3174 goto onError;
3175 }
3176 p = PyUnicode_AS_UNICODE(v) + oldpos;
3177 }
3178 Py_UNICODE_COPY(p,
3179 PyUnicode_AS_UNICODE(x),
3180 targetsize);
3181 p += targetsize;
3182 extrachars -= targetsize;
3183 }
3184 /* 1-0 mapping: skip the character */
3185 }
3186 else {
3187 /* wrong return value */
3188 PyErr_SetString(PyExc_TypeError,
3189 "character mapping must return integer, None or unicode");
3190 Py_DECREF(x);
3191 goto onError;
3192 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003193 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003194 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003195 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003196 }
3197 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003198 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003199 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003200 Py_XDECREF(errorHandler);
3201 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003202 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003203
Guido van Rossumd57fd912000-03-10 22:53:23 +00003204 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003205 Py_XDECREF(errorHandler);
3206 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003207 Py_XDECREF(v);
3208 return NULL;
3209}
3210
Martin v. Löwis3f767792006-06-04 19:36:28 +00003211/* Charmap encoding: the lookup table */
3212
3213struct encoding_map{
3214 PyObject_HEAD
3215 unsigned char level1[32];
3216 int count2, count3;
3217 unsigned char level23[1];
3218};
3219
3220static PyObject*
3221encoding_map_size(PyObject *obj, PyObject* args)
3222{
3223 struct encoding_map *map = (struct encoding_map*)obj;
3224 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3225 128*map->count3);
3226}
3227
3228static PyMethodDef encoding_map_methods[] = {
3229 {"size", encoding_map_size, METH_NOARGS,
3230 PyDoc_STR("Return the size (in bytes) of this object") },
3231 { 0 }
3232};
3233
3234static void
3235encoding_map_dealloc(PyObject* o)
3236{
3237 PyObject_FREE(o);
3238}
3239
3240static PyTypeObject EncodingMapType = {
3241 PyObject_HEAD_INIT(NULL)
3242 0, /*ob_size*/
3243 "EncodingMap", /*tp_name*/
3244 sizeof(struct encoding_map), /*tp_basicsize*/
3245 0, /*tp_itemsize*/
3246 /* methods */
3247 encoding_map_dealloc, /*tp_dealloc*/
3248 0, /*tp_print*/
3249 0, /*tp_getattr*/
3250 0, /*tp_setattr*/
3251 0, /*tp_compare*/
3252 0, /*tp_repr*/
3253 0, /*tp_as_number*/
3254 0, /*tp_as_sequence*/
3255 0, /*tp_as_mapping*/
3256 0, /*tp_hash*/
3257 0, /*tp_call*/
3258 0, /*tp_str*/
3259 0, /*tp_getattro*/
3260 0, /*tp_setattro*/
3261 0, /*tp_as_buffer*/
3262 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3263 0, /*tp_doc*/
3264 0, /*tp_traverse*/
3265 0, /*tp_clear*/
3266 0, /*tp_richcompare*/
3267 0, /*tp_weaklistoffset*/
3268 0, /*tp_iter*/
3269 0, /*tp_iternext*/
3270 encoding_map_methods, /*tp_methods*/
3271 0, /*tp_members*/
3272 0, /*tp_getset*/
3273 0, /*tp_base*/
3274 0, /*tp_dict*/
3275 0, /*tp_descr_get*/
3276 0, /*tp_descr_set*/
3277 0, /*tp_dictoffset*/
3278 0, /*tp_init*/
3279 0, /*tp_alloc*/
3280 0, /*tp_new*/
3281 0, /*tp_free*/
3282 0, /*tp_is_gc*/
3283};
3284
3285PyObject*
3286PyUnicode_BuildEncodingMap(PyObject* string)
3287{
3288 Py_UNICODE *decode;
3289 PyObject *result;
3290 struct encoding_map *mresult;
3291 int i;
3292 int need_dict = 0;
3293 unsigned char level1[32];
3294 unsigned char level2[512];
3295 unsigned char *mlevel1, *mlevel2, *mlevel3;
3296 int count2 = 0, count3 = 0;
3297
3298 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3299 PyErr_BadArgument();
3300 return NULL;
3301 }
3302 decode = PyUnicode_AS_UNICODE(string);
3303 memset(level1, 0xFF, sizeof level1);
3304 memset(level2, 0xFF, sizeof level2);
3305
3306 /* If there isn't a one-to-one mapping of NULL to \0,
3307 or if there are non-BMP characters, we need to use
3308 a mapping dictionary. */
3309 if (decode[0] != 0)
3310 need_dict = 1;
3311 for (i = 1; i < 256; i++) {
3312 int l1, l2;
3313 if (decode[i] == 0
3314 #ifdef Py_UNICODE_WIDE
3315 || decode[i] > 0xFFFF
3316 #endif
3317 ) {
3318 need_dict = 1;
3319 break;
3320 }
3321 if (decode[i] == 0xFFFE)
3322 /* unmapped character */
3323 continue;
3324 l1 = decode[i] >> 11;
3325 l2 = decode[i] >> 7;
3326 if (level1[l1] == 0xFF)
3327 level1[l1] = count2++;
3328 if (level2[l2] == 0xFF)
3329 level2[l2] = count3++;
3330 }
3331
3332 if (count2 >= 0xFF || count3 >= 0xFF)
3333 need_dict = 1;
3334
3335 if (need_dict) {
3336 PyObject *result = PyDict_New();
3337 PyObject *key, *value;
3338 if (!result)
3339 return NULL;
3340 for (i = 0; i < 256; i++) {
3341 key = value = NULL;
3342 key = PyInt_FromLong(decode[i]);
3343 value = PyInt_FromLong(i);
3344 if (!key || !value)
3345 goto failed1;
3346 if (PyDict_SetItem(result, key, value) == -1)
3347 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00003348 Py_DECREF(key);
3349 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003350 }
3351 return result;
3352 failed1:
3353 Py_XDECREF(key);
3354 Py_XDECREF(value);
3355 Py_DECREF(result);
3356 return NULL;
3357 }
3358
3359 /* Create a three-level trie */
3360 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3361 16*count2 + 128*count3 - 1);
3362 if (!result)
3363 return PyErr_NoMemory();
3364 PyObject_Init(result, &EncodingMapType);
3365 mresult = (struct encoding_map*)result;
3366 mresult->count2 = count2;
3367 mresult->count3 = count3;
3368 mlevel1 = mresult->level1;
3369 mlevel2 = mresult->level23;
3370 mlevel3 = mresult->level23 + 16*count2;
3371 memcpy(mlevel1, level1, 32);
3372 memset(mlevel2, 0xFF, 16*count2);
3373 memset(mlevel3, 0, 128*count3);
3374 count3 = 0;
3375 for (i = 1; i < 256; i++) {
3376 int o1, o2, o3, i2, i3;
3377 if (decode[i] == 0xFFFE)
3378 /* unmapped character */
3379 continue;
3380 o1 = decode[i]>>11;
3381 o2 = (decode[i]>>7) & 0xF;
3382 i2 = 16*mlevel1[o1] + o2;
3383 if (mlevel2[i2] == 0xFF)
3384 mlevel2[i2] = count3++;
3385 o3 = decode[i] & 0x7F;
3386 i3 = 128*mlevel2[i2] + o3;
3387 mlevel3[i3] = i;
3388 }
3389 return result;
3390}
3391
3392static int
3393encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3394{
3395 struct encoding_map *map = (struct encoding_map*)mapping;
3396 int l1 = c>>11;
3397 int l2 = (c>>7) & 0xF;
3398 int l3 = c & 0x7F;
3399 int i;
3400
3401#ifdef Py_UNICODE_WIDE
3402 if (c > 0xFFFF) {
3403 return -1;
3404 }
3405#endif
3406 if (c == 0)
3407 return 0;
3408 /* level 1*/
3409 i = map->level1[l1];
3410 if (i == 0xFF) {
3411 return -1;
3412 }
3413 /* level 2*/
3414 i = map->level23[16*i+l2];
3415 if (i == 0xFF) {
3416 return -1;
3417 }
3418 /* level 3 */
3419 i = map->level23[16*map->count2 + 128*i + l3];
3420 if (i == 0) {
3421 return -1;
3422 }
3423 return i;
3424}
3425
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003426/* Lookup the character ch in the mapping. If the character
3427 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003428 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003429static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003430{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003431 PyObject *w = PyInt_FromLong((long)c);
3432 PyObject *x;
3433
3434 if (w == NULL)
3435 return NULL;
3436 x = PyObject_GetItem(mapping, w);
3437 Py_DECREF(w);
3438 if (x == NULL) {
3439 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3440 /* No mapping found means: mapping is undefined. */
3441 PyErr_Clear();
3442 x = Py_None;
3443 Py_INCREF(x);
3444 return x;
3445 } else
3446 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003448 else if (x == Py_None)
3449 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003450 else if (PyInt_Check(x)) {
3451 long value = PyInt_AS_LONG(x);
3452 if (value < 0 || value > 255) {
3453 PyErr_SetString(PyExc_TypeError,
3454 "character mapping must be in range(256)");
3455 Py_DECREF(x);
3456 return NULL;
3457 }
3458 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003459 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003460 else if (PyString_Check(x))
3461 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003462 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003463 /* wrong return value */
3464 PyErr_SetString(PyExc_TypeError,
3465 "character mapping must return integer, None or str");
3466 Py_DECREF(x);
3467 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003468 }
3469}
3470
Martin v. Löwis3f767792006-06-04 19:36:28 +00003471static int
3472charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3473{
3474 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3475 /* exponentially overallocate to minimize reallocations */
3476 if (requiredsize < 2*outsize)
3477 requiredsize = 2*outsize;
3478 if (_PyString_Resize(outobj, requiredsize)) {
3479 return 0;
3480 }
3481 return 1;
3482}
3483
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* a Python exception has been set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003487/* lookup the character, put the result in the output string and adjust
3488 various state variables. Reallocate the output string if not enough
3489 space is available. Return a new reference to the object that
3490 was put in the output buffer, or Py_None, if the mapping was undefined
3491 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003492 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003493static
Martin v. Löwis3f767792006-06-04 19:36:28 +00003494charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003495 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003496{
Martin v. Löwis3f767792006-06-04 19:36:28 +00003497 PyObject *rep;
3498 char *outstart;
3499 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003500
Martin v. Löwis3f767792006-06-04 19:36:28 +00003501 if (mapping->ob_type == &EncodingMapType) {
3502 int res = encoding_map_lookup(c, mapping);
3503 Py_ssize_t requiredsize = *outpos+1;
3504 if (res == -1)
3505 return enc_FAILED;
3506 if (outsize<requiredsize)
3507 if (!charmapencode_resize(outobj, outpos, requiredsize))
3508 return enc_EXCEPTION;
3509 outstart = PyString_AS_STRING(*outobj);
3510 outstart[(*outpos)++] = (char)res;
3511 return enc_SUCCESS;
3512 }
3513
3514 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003515 if (rep==NULL)
Martin v. Löwis3f767792006-06-04 19:36:28 +00003516 return enc_EXCEPTION;
3517 else if (rep==Py_None) {
3518 Py_DECREF(rep);
3519 return enc_FAILED;
3520 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003521 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003522 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003523 if (outsize<requiredsize)
3524 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003525 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003526 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003527 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003528 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003529 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3530 }
3531 else {
3532 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003533 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3534 Py_ssize_t requiredsize = *outpos+repsize;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003535 if (outsize<requiredsize)
3536 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003537 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003538 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003539 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003540 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003541 memcpy(outstart + *outpos, repchars, repsize);
3542 *outpos += repsize;
3543 }
3544 }
Georg Brandl9f167602006-06-04 21:46:16 +00003545 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003546 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003547}
3548
3549/* handle an error in PyUnicode_EncodeCharmap
3550 Return 0 on success, -1 on error */
3551static
3552int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003553 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003554 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003555 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003556 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003557{
3558 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003559 Py_ssize_t repsize;
3560 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003561 Py_UNICODE *uni2;
3562 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003563 Py_ssize_t collstartpos = *inpos;
3564 Py_ssize_t collendpos = *inpos+1;
3565 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003566 char *encoding = "charmap";
3567 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00003568 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003569
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003570 /* find all unencodable characters */
3571 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00003572 PyObject *rep;
3573 if (mapping->ob_type == &EncodingMapType) {
3574 int res = encoding_map_lookup(p[collendpos], mapping);
3575 if (res != -1)
3576 break;
3577 ++collendpos;
3578 continue;
3579 }
3580
3581 rep = charmapencode_lookup(p[collendpos], mapping);
3582 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003583 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003584 else if (rep!=Py_None) {
3585 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003586 break;
3587 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003588 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003589 ++collendpos;
3590 }
3591 /* cache callback name lookup
3592 * (if not done yet, i.e. it's the first error) */
3593 if (*known_errorHandler==-1) {
3594 if ((errors==NULL) || (!strcmp(errors, "strict")))
3595 *known_errorHandler = 1;
3596 else if (!strcmp(errors, "replace"))
3597 *known_errorHandler = 2;
3598 else if (!strcmp(errors, "ignore"))
3599 *known_errorHandler = 3;
3600 else if (!strcmp(errors, "xmlcharrefreplace"))
3601 *known_errorHandler = 4;
3602 else
3603 *known_errorHandler = 0;
3604 }
3605 switch (*known_errorHandler) {
3606 case 1: /* strict */
3607 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3608 return -1;
3609 case 2: /* replace */
3610 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3611 x = charmapencode_output('?', mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003612 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003613 return -1;
3614 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003615 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003616 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3617 return -1;
3618 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003619 }
3620 /* fall through */
3621 case 3: /* ignore */
3622 *inpos = collendpos;
3623 break;
3624 case 4: /* xmlcharrefreplace */
3625 /* generate replacement (temporarily (mis)uses p) */
3626 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3627 char buffer[2+29+1+1];
3628 char *cp;
3629 sprintf(buffer, "&#%d;", (int)p[collpos]);
3630 for (cp = buffer; *cp; ++cp) {
3631 x = charmapencode_output(*cp, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003632 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003633 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003634 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003635 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3636 return -1;
3637 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003638 }
3639 }
3640 *inpos = collendpos;
3641 break;
3642 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003643 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003644 encoding, reason, p, size, exceptionObject,
3645 collstartpos, collendpos, &newpos);
3646 if (repunicode == NULL)
3647 return -1;
3648 /* generate replacement */
3649 repsize = PyUnicode_GET_SIZE(repunicode);
3650 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3651 x = charmapencode_output(*uni2, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003652 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003653 return -1;
3654 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003655 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003656 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003657 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3658 return -1;
3659 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003660 }
3661 *inpos = newpos;
3662 Py_DECREF(repunicode);
3663 }
3664 return 0;
3665}
3666
Guido van Rossumd57fd912000-03-10 22:53:23 +00003667PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003668 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003669 PyObject *mapping,
3670 const char *errors)
3671{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003672 /* output object */
3673 PyObject *res = NULL;
3674 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003675 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003676 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003677 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003678 PyObject *errorHandler = NULL;
3679 PyObject *exc = NULL;
3680 /* the following variable is used for caching string comparisons
3681 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3682 * 3=ignore, 4=xmlcharrefreplace */
3683 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003684
3685 /* Default to Latin-1 */
3686 if (mapping == NULL)
3687 return PyUnicode_EncodeLatin1(p, size, errors);
3688
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003689 /* allocate enough for a simple encoding without
3690 replacements, if we need more, we'll resize */
3691 res = PyString_FromStringAndSize(NULL, size);
3692 if (res == NULL)
3693 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003694 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003695 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003696
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003697 while (inpos<size) {
3698 /* try to encode it */
Martin v. Löwis3f767792006-06-04 19:36:28 +00003699 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3700 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003701 goto onError;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003702 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003703 if (charmap_encoding_error(p, size, &inpos, mapping,
3704 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003705 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003706 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003707 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003708 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003709 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003710 else
3711 /* done with this character => adjust input position */
3712 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003713 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003714
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003715 /* Resize if we allocated to much */
3716 if (respos<PyString_GET_SIZE(res)) {
3717 if (_PyString_Resize(&res, respos))
3718 goto onError;
3719 }
3720 Py_XDECREF(exc);
3721 Py_XDECREF(errorHandler);
3722 return res;
3723
3724 onError:
3725 Py_XDECREF(res);
3726 Py_XDECREF(exc);
3727 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003728 return NULL;
3729}
3730
3731PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3732 PyObject *mapping)
3733{
3734 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3735 PyErr_BadArgument();
3736 return NULL;
3737 }
3738 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3739 PyUnicode_GET_SIZE(unicode),
3740 mapping,
3741 NULL);
3742}
3743
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003744/* create or adjust a UnicodeTranslateError */
3745static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003746 const Py_UNICODE *unicode, Py_ssize_t size,
3747 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003748 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003749{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003750 if (*exceptionObject == NULL) {
3751 *exceptionObject = PyUnicodeTranslateError_Create(
3752 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003753 }
3754 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003755 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3756 goto onError;
3757 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3758 goto onError;
3759 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3760 goto onError;
3761 return;
3762 onError:
3763 Py_DECREF(*exceptionObject);
3764 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003765 }
3766}
3767
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003768/* raises a UnicodeTranslateError */
3769static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003770 const Py_UNICODE *unicode, Py_ssize_t size,
3771 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003772 const char *reason)
3773{
3774 make_translate_exception(exceptionObject,
3775 unicode, size, startpos, endpos, reason);
3776 if (*exceptionObject != NULL)
3777 PyCodec_StrictErrors(*exceptionObject);
3778}
3779
3780/* error handling callback helper:
3781 build arguments, call the callback and check the arguments,
3782 put the result into newpos and return the replacement string, which
3783 has to be freed by the caller */
3784static PyObject *unicode_translate_call_errorhandler(const char *errors,
3785 PyObject **errorHandler,
3786 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003787 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3788 Py_ssize_t startpos, Py_ssize_t endpos,
3789 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003790{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003791 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003792
Martin v. Löwis412fb672006-04-13 06:34:32 +00003793 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003794 PyObject *restuple;
3795 PyObject *resunicode;
3796
3797 if (*errorHandler == NULL) {
3798 *errorHandler = PyCodec_LookupError(errors);
3799 if (*errorHandler == NULL)
3800 return NULL;
3801 }
3802
3803 make_translate_exception(exceptionObject,
3804 unicode, size, startpos, endpos, reason);
3805 if (*exceptionObject == NULL)
3806 return NULL;
3807
3808 restuple = PyObject_CallFunctionObjArgs(
3809 *errorHandler, *exceptionObject, NULL);
3810 if (restuple == NULL)
3811 return NULL;
3812 if (!PyTuple_Check(restuple)) {
3813 PyErr_Format(PyExc_TypeError, &argparse[4]);
3814 Py_DECREF(restuple);
3815 return NULL;
3816 }
3817 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003818 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003819 Py_DECREF(restuple);
3820 return NULL;
3821 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003822 if (i_newpos<0)
3823 *newpos = size+i_newpos;
3824 else
3825 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003826 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003827 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003828 Py_DECREF(restuple);
3829 return NULL;
3830 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003831 Py_INCREF(resunicode);
3832 Py_DECREF(restuple);
3833 return resunicode;
3834}
3835
3836/* Lookup the character ch in the mapping and put the result in result,
3837 which must be decrefed by the caller.
3838 Return 0 on success, -1 on error */
3839static
3840int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3841{
3842 PyObject *w = PyInt_FromLong((long)c);
3843 PyObject *x;
3844
3845 if (w == NULL)
3846 return -1;
3847 x = PyObject_GetItem(mapping, w);
3848 Py_DECREF(w);
3849 if (x == NULL) {
3850 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3851 /* No mapping found means: use 1:1 mapping. */
3852 PyErr_Clear();
3853 *result = NULL;
3854 return 0;
3855 } else
3856 return -1;
3857 }
3858 else if (x == Py_None) {
3859 *result = x;
3860 return 0;
3861 }
3862 else if (PyInt_Check(x)) {
3863 long value = PyInt_AS_LONG(x);
3864 long max = PyUnicode_GetMax();
3865 if (value < 0 || value > max) {
3866 PyErr_Format(PyExc_TypeError,
3867 "character mapping must be in range(0x%lx)", max+1);
3868 Py_DECREF(x);
3869 return -1;
3870 }
3871 *result = x;
3872 return 0;
3873 }
3874 else if (PyUnicode_Check(x)) {
3875 *result = x;
3876 return 0;
3877 }
3878 else {
3879 /* wrong return value */
3880 PyErr_SetString(PyExc_TypeError,
3881 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003882 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003883 return -1;
3884 }
3885}
3886/* ensure that *outobj is at least requiredsize characters long,
3887if not reallocate and adjust various state variables.
3888Return 0 on success, -1 on error */
3889static
Walter Dörwald4894c302003-10-24 14:25:28 +00003890int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003891 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003892{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003893 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003894 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003895 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003896 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003897 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003898 if (requiredsize < 2 * oldsize)
3899 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003900 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003901 return -1;
3902 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003903 }
3904 return 0;
3905}
3906/* lookup the character, put the result in the output string and adjust
3907 various state variables. Return a new reference to the object that
3908 was put in the output buffer in *result, or Py_None, if the mapping was
3909 undefined (in which case no character was written).
3910 The called must decref result.
3911 Return 0 on success, -1 on error. */
3912static
Walter Dörwald4894c302003-10-24 14:25:28 +00003913int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003914 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003915 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003916{
Walter Dörwald4894c302003-10-24 14:25:28 +00003917 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003918 return -1;
3919 if (*res==NULL) {
3920 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003921 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003922 }
3923 else if (*res==Py_None)
3924 ;
3925 else if (PyInt_Check(*res)) {
3926 /* no overflow check, because we know that the space is enough */
3927 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3928 }
3929 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003930 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003931 if (repsize==1) {
3932 /* no overflow check, because we know that the space is enough */
3933 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3934 }
3935 else if (repsize!=0) {
3936 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003937 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003938 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003939 repsize - 1;
3940 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003941 return -1;
3942 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3943 *outp += repsize;
3944 }
3945 }
3946 else
3947 return -1;
3948 return 0;
3949}
3950
3951PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003952 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003953 PyObject *mapping,
3954 const char *errors)
3955{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003956 /* output object */
3957 PyObject *res = NULL;
3958 /* pointers to the beginning and end+1 of input */
3959 const Py_UNICODE *startp = p;
3960 const Py_UNICODE *endp = p + size;
3961 /* pointer into the output */
3962 Py_UNICODE *str;
3963 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003964 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003965 char *reason = "character maps to <undefined>";
3966 PyObject *errorHandler = NULL;
3967 PyObject *exc = NULL;
3968 /* the following variable is used for caching string comparisons
3969 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3970 * 3=ignore, 4=xmlcharrefreplace */
3971 int known_errorHandler = -1;
3972
Guido van Rossumd57fd912000-03-10 22:53:23 +00003973 if (mapping == NULL) {
3974 PyErr_BadArgument();
3975 return NULL;
3976 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003977
3978 /* allocate enough for a simple 1:1 translation without
3979 replacements, if we need more, we'll resize */
3980 res = PyUnicode_FromUnicode(NULL, size);
3981 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003982 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003983 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003984 return res;
3985 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003986
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003987 while (p<endp) {
3988 /* try to encode it */
3989 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003990 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003991 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003992 goto onError;
3993 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003994 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003995 if (x!=Py_None) /* it worked => adjust input pointer */
3996 ++p;
3997 else { /* untranslatable character */
3998 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003999 Py_ssize_t repsize;
4000 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004001 Py_UNICODE *uni2;
4002 /* startpos for collecting untranslatable chars */
4003 const Py_UNICODE *collstart = p;
4004 const Py_UNICODE *collend = p+1;
4005 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004006
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004007 /* find all untranslatable characters */
4008 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004009 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004010 goto onError;
4011 Py_XDECREF(x);
4012 if (x!=Py_None)
4013 break;
4014 ++collend;
4015 }
4016 /* cache callback name lookup
4017 * (if not done yet, i.e. it's the first error) */
4018 if (known_errorHandler==-1) {
4019 if ((errors==NULL) || (!strcmp(errors, "strict")))
4020 known_errorHandler = 1;
4021 else if (!strcmp(errors, "replace"))
4022 known_errorHandler = 2;
4023 else if (!strcmp(errors, "ignore"))
4024 known_errorHandler = 3;
4025 else if (!strcmp(errors, "xmlcharrefreplace"))
4026 known_errorHandler = 4;
4027 else
4028 known_errorHandler = 0;
4029 }
4030 switch (known_errorHandler) {
4031 case 1: /* strict */
4032 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4033 goto onError;
4034 case 2: /* replace */
4035 /* No need to check for space, this is a 1:1 replacement */
4036 for (coll = collstart; coll<collend; ++coll)
4037 *str++ = '?';
4038 /* fall through */
4039 case 3: /* ignore */
4040 p = collend;
4041 break;
4042 case 4: /* xmlcharrefreplace */
4043 /* generate replacement (temporarily (mis)uses p) */
4044 for (p = collstart; p < collend; ++p) {
4045 char buffer[2+29+1+1];
4046 char *cp;
4047 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004048 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004049 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4050 goto onError;
4051 for (cp = buffer; *cp; ++cp)
4052 *str++ = *cp;
4053 }
4054 p = collend;
4055 break;
4056 default:
4057 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4058 reason, startp, size, &exc,
4059 collstart-startp, collend-startp, &newpos);
4060 if (repunicode == NULL)
4061 goto onError;
4062 /* generate replacement */
4063 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004064 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004065 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4066 Py_DECREF(repunicode);
4067 goto onError;
4068 }
4069 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4070 *str++ = *uni2;
4071 p = startp + newpos;
4072 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004073 }
4074 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004075 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004076 /* Resize if we allocated to much */
4077 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004078 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004079 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004080 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004081 }
4082 Py_XDECREF(exc);
4083 Py_XDECREF(errorHandler);
4084 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004085
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004086 onError:
4087 Py_XDECREF(res);
4088 Py_XDECREF(exc);
4089 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004090 return NULL;
4091}
4092
4093PyObject *PyUnicode_Translate(PyObject *str,
4094 PyObject *mapping,
4095 const char *errors)
4096{
4097 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004098
Guido van Rossumd57fd912000-03-10 22:53:23 +00004099 str = PyUnicode_FromObject(str);
4100 if (str == NULL)
4101 goto onError;
4102 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4103 PyUnicode_GET_SIZE(str),
4104 mapping,
4105 errors);
4106 Py_DECREF(str);
4107 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004108
Guido van Rossumd57fd912000-03-10 22:53:23 +00004109 onError:
4110 Py_XDECREF(str);
4111 return NULL;
4112}
Tim Petersced69f82003-09-16 20:30:58 +00004113
Guido van Rossum9e896b32000-04-05 20:11:21 +00004114/* --- Decimal Encoder ---------------------------------------------------- */
4115
4116int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004117 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004118 char *output,
4119 const char *errors)
4120{
4121 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004122 PyObject *errorHandler = NULL;
4123 PyObject *exc = NULL;
4124 const char *encoding = "decimal";
4125 const char *reason = "invalid decimal Unicode string";
4126 /* the following variable is used for caching string comparisons
4127 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4128 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004129
4130 if (output == NULL) {
4131 PyErr_BadArgument();
4132 return -1;
4133 }
4134
4135 p = s;
4136 end = s + length;
4137 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004138 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004139 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004140 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004141 Py_ssize_t repsize;
4142 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004143 Py_UNICODE *uni2;
4144 Py_UNICODE *collstart;
4145 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004146
Guido van Rossum9e896b32000-04-05 20:11:21 +00004147 if (Py_UNICODE_ISSPACE(ch)) {
4148 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004149 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004150 continue;
4151 }
4152 decimal = Py_UNICODE_TODECIMAL(ch);
4153 if (decimal >= 0) {
4154 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004155 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004156 continue;
4157 }
Guido van Rossumba477042000-04-06 18:18:10 +00004158 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004159 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004160 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004161 continue;
4162 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004163 /* All other characters are considered unencodable */
4164 collstart = p;
4165 collend = p+1;
4166 while (collend < end) {
4167 if ((0 < *collend && *collend < 256) ||
4168 !Py_UNICODE_ISSPACE(*collend) ||
4169 Py_UNICODE_TODECIMAL(*collend))
4170 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004171 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004172 /* cache callback name lookup
4173 * (if not done yet, i.e. it's the first error) */
4174 if (known_errorHandler==-1) {
4175 if ((errors==NULL) || (!strcmp(errors, "strict")))
4176 known_errorHandler = 1;
4177 else if (!strcmp(errors, "replace"))
4178 known_errorHandler = 2;
4179 else if (!strcmp(errors, "ignore"))
4180 known_errorHandler = 3;
4181 else if (!strcmp(errors, "xmlcharrefreplace"))
4182 known_errorHandler = 4;
4183 else
4184 known_errorHandler = 0;
4185 }
4186 switch (known_errorHandler) {
4187 case 1: /* strict */
4188 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4189 goto onError;
4190 case 2: /* replace */
4191 for (p = collstart; p < collend; ++p)
4192 *output++ = '?';
4193 /* fall through */
4194 case 3: /* ignore */
4195 p = collend;
4196 break;
4197 case 4: /* xmlcharrefreplace */
4198 /* generate replacement (temporarily (mis)uses p) */
4199 for (p = collstart; p < collend; ++p)
4200 output += sprintf(output, "&#%d;", (int)*p);
4201 p = collend;
4202 break;
4203 default:
4204 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4205 encoding, reason, s, length, &exc,
4206 collstart-s, collend-s, &newpos);
4207 if (repunicode == NULL)
4208 goto onError;
4209 /* generate replacement */
4210 repsize = PyUnicode_GET_SIZE(repunicode);
4211 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4212 Py_UNICODE ch = *uni2;
4213 if (Py_UNICODE_ISSPACE(ch))
4214 *output++ = ' ';
4215 else {
4216 decimal = Py_UNICODE_TODECIMAL(ch);
4217 if (decimal >= 0)
4218 *output++ = '0' + decimal;
4219 else if (0 < ch && ch < 256)
4220 *output++ = (char)ch;
4221 else {
4222 Py_DECREF(repunicode);
4223 raise_encode_exception(&exc, encoding,
4224 s, length, collstart-s, collend-s, reason);
4225 goto onError;
4226 }
4227 }
4228 }
4229 p = s + newpos;
4230 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004231 }
4232 }
4233 /* 0-terminate the output string */
4234 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004235 Py_XDECREF(exc);
4236 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004237 return 0;
4238
4239 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004240 Py_XDECREF(exc);
4241 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004242 return -1;
4243}
4244
Guido van Rossumd57fd912000-03-10 22:53:23 +00004245/* --- Helpers ------------------------------------------------------------ */
4246
Fredrik Lundha50d2012006-05-26 17:04:58 +00004247#define STRINGLIB_CHAR Py_UNICODE
Fredrik Lundh6471ee42006-05-24 14:28:11 +00004248
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004249#define STRINGLIB_LEN PyUnicode_GET_SIZE
Fredrik Lundhb9479482006-05-26 17:22:38 +00004250#define STRINGLIB_NEW PyUnicode_FromUnicode
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004251#define STRINGLIB_STR PyUnicode_AS_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00004252
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00004253Py_LOCAL_INLINE(int)
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004254STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
4255{
Fredrik Lundh9c0e9c02006-05-26 18:24:15 +00004256 if (str[0] != other[0])
4257 return 1;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004258 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
4259}
4260
Fredrik Lundhb9479482006-05-26 17:22:38 +00004261#define STRINGLIB_EMPTY unicode_empty
4262
Fredrik Lundha50d2012006-05-26 17:04:58 +00004263#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004264
4265#include "stringlib/count.h"
4266#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00004267#include "stringlib/partition.h"
4268
/* Helper macro to fix up start/end slice values.

   Operates on variables named `start` and `end` in the scope where it is
   expanded, using (obj)->length as the sequence length: a negative start
   is taken relative to the end and then floored at 0; end is first capped
   at the length, and a negative end is then taken relative to the end and
   floored at 0.

   The expansion is a plain sequence of statements (not a single
   statement) and evaluates obj more than once. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
4281
Martin v. Löwis18e16552006-02-15 17:27:45 +00004282Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004283 PyObject *substr,
4284 Py_ssize_t start,
4285 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004286{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004287 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004288 PyUnicodeObject* str_obj;
4289 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004290
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004291 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4292 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004293 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004294 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4295 if (!sub_obj) {
4296 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004297 return -1;
4298 }
Tim Petersced69f82003-09-16 20:30:58 +00004299
Fredrik Lundhc8162812006-05-26 19:33:03 +00004300 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004301
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004302 result = stringlib_count(
4303 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4304 );
4305
4306 Py_DECREF(sub_obj);
4307 Py_DECREF(str_obj);
4308
Guido van Rossumd57fd912000-03-10 22:53:23 +00004309 return result;
4310}
4311
Martin v. Löwis18e16552006-02-15 17:27:45 +00004312Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004313 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004314 Py_ssize_t start,
4315 Py_ssize_t end,
4316 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004317{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004318 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004319
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004320 str = PyUnicode_FromObject(str);
4321 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004322 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004323 sub = PyUnicode_FromObject(sub);
4324 if (!sub) {
4325 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004326 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004327 }
Tim Petersced69f82003-09-16 20:30:58 +00004328
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004329 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004330 result = stringlib_find_slice(
4331 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4332 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4333 start, end
4334 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004335 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004336 result = stringlib_rfind_slice(
4337 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4338 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4339 start, end
4340 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004341
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004342 Py_DECREF(str);
4343 Py_DECREF(sub);
4344
Guido van Rossumd57fd912000-03-10 22:53:23 +00004345 return result;
4346}
4347
Tim Petersced69f82003-09-16 20:30:58 +00004348static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004349int tailmatch(PyUnicodeObject *self,
4350 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004351 Py_ssize_t start,
4352 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004353 int direction)
4354{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004355 if (substring->length == 0)
4356 return 1;
4357
Fredrik Lundhc8162812006-05-26 19:33:03 +00004358 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004359
4360 end -= substring->length;
4361 if (end < start)
4362 return 0;
4363
4364 if (direction > 0) {
4365 if (Py_UNICODE_MATCH(self, end, substring))
4366 return 1;
4367 } else {
4368 if (Py_UNICODE_MATCH(self, start, substring))
4369 return 1;
4370 }
4371
4372 return 0;
4373}
4374
Martin v. Löwis18e16552006-02-15 17:27:45 +00004375Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004376 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004377 Py_ssize_t start,
4378 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004379 int direction)
4380{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004381 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004382
Guido van Rossumd57fd912000-03-10 22:53:23 +00004383 str = PyUnicode_FromObject(str);
4384 if (str == NULL)
4385 return -1;
4386 substr = PyUnicode_FromObject(substr);
4387 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004388 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004389 return -1;
4390 }
Tim Petersced69f82003-09-16 20:30:58 +00004391
Guido van Rossumd57fd912000-03-10 22:53:23 +00004392 result = tailmatch((PyUnicodeObject *)str,
4393 (PyUnicodeObject *)substr,
4394 start, end, direction);
4395 Py_DECREF(str);
4396 Py_DECREF(substr);
4397 return result;
4398}
4399
Guido van Rossumd57fd912000-03-10 22:53:23 +00004400/* Apply fixfct filter to the Unicode object self and return a
4401 reference to the modified object */
4402
Tim Petersced69f82003-09-16 20:30:58 +00004403static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004404PyObject *fixup(PyUnicodeObject *self,
4405 int (*fixfct)(PyUnicodeObject *s))
4406{
4407
4408 PyUnicodeObject *u;
4409
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004410 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004411 if (u == NULL)
4412 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004413
4414 Py_UNICODE_COPY(u->str, self->str, self->length);
4415
Tim Peters7a29bd52001-09-12 03:03:31 +00004416 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004417 /* fixfct should return TRUE if it modified the buffer. If
4418 FALSE, return a reference to the original buffer instead
4419 (to save space, not time) */
4420 Py_INCREF(self);
4421 Py_DECREF(u);
4422 return (PyObject*) self;
4423 }
4424 return (PyObject*) u;
4425}
4426
Tim Petersced69f82003-09-16 20:30:58 +00004427static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004428int fixupper(PyUnicodeObject *self)
4429{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004430 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004431 Py_UNICODE *s = self->str;
4432 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004433
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434 while (len-- > 0) {
4435 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004436
Guido van Rossumd57fd912000-03-10 22:53:23 +00004437 ch = Py_UNICODE_TOUPPER(*s);
4438 if (ch != *s) {
4439 status = 1;
4440 *s = ch;
4441 }
4442 s++;
4443 }
4444
4445 return status;
4446}
4447
Tim Petersced69f82003-09-16 20:30:58 +00004448static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004449int fixlower(PyUnicodeObject *self)
4450{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004451 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004452 Py_UNICODE *s = self->str;
4453 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004454
Guido van Rossumd57fd912000-03-10 22:53:23 +00004455 while (len-- > 0) {
4456 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004457
Guido van Rossumd57fd912000-03-10 22:53:23 +00004458 ch = Py_UNICODE_TOLOWER(*s);
4459 if (ch != *s) {
4460 status = 1;
4461 *s = ch;
4462 }
4463 s++;
4464 }
4465
4466 return status;
4467}
4468
Tim Petersced69f82003-09-16 20:30:58 +00004469static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004470int fixswapcase(PyUnicodeObject *self)
4471{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004472 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004473 Py_UNICODE *s = self->str;
4474 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004475
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476 while (len-- > 0) {
4477 if (Py_UNICODE_ISUPPER(*s)) {
4478 *s = Py_UNICODE_TOLOWER(*s);
4479 status = 1;
4480 } else if (Py_UNICODE_ISLOWER(*s)) {
4481 *s = Py_UNICODE_TOUPPER(*s);
4482 status = 1;
4483 }
4484 s++;
4485 }
4486
4487 return status;
4488}
4489
Tim Petersced69f82003-09-16 20:30:58 +00004490static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004491int fixcapitalize(PyUnicodeObject *self)
4492{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004493 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004494 Py_UNICODE *s = self->str;
4495 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004496
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004497 if (len == 0)
4498 return 0;
4499 if (Py_UNICODE_ISLOWER(*s)) {
4500 *s = Py_UNICODE_TOUPPER(*s);
4501 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004502 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004503 s++;
4504 while (--len > 0) {
4505 if (Py_UNICODE_ISUPPER(*s)) {
4506 *s = Py_UNICODE_TOLOWER(*s);
4507 status = 1;
4508 }
4509 s++;
4510 }
4511 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004512}
4513
4514static
4515int fixtitle(PyUnicodeObject *self)
4516{
4517 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4518 register Py_UNICODE *e;
4519 int previous_is_cased;
4520
4521 /* Shortcut for single character strings */
4522 if (PyUnicode_GET_SIZE(self) == 1) {
4523 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4524 if (*p != ch) {
4525 *p = ch;
4526 return 1;
4527 }
4528 else
4529 return 0;
4530 }
Tim Petersced69f82003-09-16 20:30:58 +00004531
Guido van Rossumd57fd912000-03-10 22:53:23 +00004532 e = p + PyUnicode_GET_SIZE(self);
4533 previous_is_cased = 0;
4534 for (; p < e; p++) {
4535 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004536
Guido van Rossumd57fd912000-03-10 22:53:23 +00004537 if (previous_is_cased)
4538 *p = Py_UNICODE_TOLOWER(ch);
4539 else
4540 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004541
4542 if (Py_UNICODE_ISLOWER(ch) ||
4543 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004544 Py_UNICODE_ISTITLE(ch))
4545 previous_is_cased = 1;
4546 else
4547 previous_is_cased = 0;
4548 }
4549 return 1;
4550}
4551
Tim Peters8ce9f162004-08-27 01:49:32 +00004552PyObject *
4553PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004554{
Tim Peters8ce9f162004-08-27 01:49:32 +00004555 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004556 const Py_UNICODE blank = ' ';
4557 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00004558 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004559 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00004560 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4561 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004562 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4563 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004564 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004565 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004566 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004567
Tim Peters05eba1f2004-08-27 21:32:02 +00004568 fseq = PySequence_Fast(seq, "");
4569 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004570 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004571 }
4572
Tim Peters91879ab2004-08-27 22:35:44 +00004573 /* Grrrr. A codec may be invoked to convert str objects to
4574 * Unicode, and so it's possible to call back into Python code
4575 * during PyUnicode_FromObject(), and so it's possible for a sick
4576 * codec to change the size of fseq (if seq is a list). Therefore
4577 * we have to keep refetching the size -- can't assume seqlen
4578 * is invariant.
4579 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004580 seqlen = PySequence_Fast_GET_SIZE(fseq);
4581 /* If empty sequence, return u"". */
4582 if (seqlen == 0) {
4583 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4584 goto Done;
4585 }
4586 /* If singleton sequence with an exact Unicode, return that. */
4587 if (seqlen == 1) {
4588 item = PySequence_Fast_GET_ITEM(fseq, 0);
4589 if (PyUnicode_CheckExact(item)) {
4590 Py_INCREF(item);
4591 res = (PyUnicodeObject *)item;
4592 goto Done;
4593 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004594 }
4595
Tim Peters05eba1f2004-08-27 21:32:02 +00004596 /* At least two items to join, or one that isn't exact Unicode. */
4597 if (seqlen > 1) {
4598 /* Set up sep and seplen -- they're needed. */
4599 if (separator == NULL) {
4600 sep = &blank;
4601 seplen = 1;
4602 }
4603 else {
4604 internal_separator = PyUnicode_FromObject(separator);
4605 if (internal_separator == NULL)
4606 goto onError;
4607 sep = PyUnicode_AS_UNICODE(internal_separator);
4608 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004609 /* In case PyUnicode_FromObject() mutated seq. */
4610 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004611 }
4612 }
4613
4614 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004615 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004616 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004617 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004618 res_p = PyUnicode_AS_UNICODE(res);
4619 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004620
Tim Peters05eba1f2004-08-27 21:32:02 +00004621 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00004622 Py_ssize_t itemlen;
4623 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004624
4625 item = PySequence_Fast_GET_ITEM(fseq, i);
4626 /* Convert item to Unicode. */
4627 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4628 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00004629 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004630 " %.80s found",
4631 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004632 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004633 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004634 item = PyUnicode_FromObject(item);
4635 if (item == NULL)
4636 goto onError;
4637 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004638
Tim Peters91879ab2004-08-27 22:35:44 +00004639 /* In case PyUnicode_FromObject() mutated seq. */
4640 seqlen = PySequence_Fast_GET_SIZE(fseq);
4641
Tim Peters8ce9f162004-08-27 01:49:32 +00004642 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004643 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004644 new_res_used = res_used + itemlen;
Georg Brandl90e27d32006-06-10 06:40:50 +00004645 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004646 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004647 if (i < seqlen - 1) {
4648 new_res_used += seplen;
Georg Brandl90e27d32006-06-10 06:40:50 +00004649 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004650 goto Overflow;
4651 }
4652 if (new_res_used > res_alloc) {
4653 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004654 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004655 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00004656 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004657 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004658 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00004659 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004660 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004661 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004662 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004663 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004664 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004665
4666 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004667 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004668 res_p += itemlen;
4669 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004670 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004671 res_p += seplen;
4672 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004673 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004674 res_used = new_res_used;
4675 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004676
Tim Peters05eba1f2004-08-27 21:32:02 +00004677 /* Shrink res to match the used area; this probably can't fail,
4678 * but it's cheap to check.
4679 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004680 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004681 goto onError;
4682
4683 Done:
4684 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004685 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004686 return (PyObject *)res;
4687
Tim Peters8ce9f162004-08-27 01:49:32 +00004688 Overflow:
4689 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00004690 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00004691 Py_DECREF(item);
4692 /* fall through */
4693
Guido van Rossumd57fd912000-03-10 22:53:23 +00004694 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004695 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004696 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004697 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004698 return NULL;
4699}
4700
Tim Petersced69f82003-09-16 20:30:58 +00004701static
4702PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004703 Py_ssize_t left,
4704 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004705 Py_UNICODE fill)
4706{
4707 PyUnicodeObject *u;
4708
4709 if (left < 0)
4710 left = 0;
4711 if (right < 0)
4712 right = 0;
4713
Tim Peters7a29bd52001-09-12 03:03:31 +00004714 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004715 Py_INCREF(self);
4716 return self;
4717 }
4718
4719 u = _PyUnicode_New(left + self->length + right);
4720 if (u) {
4721 if (left)
4722 Py_UNICODE_FILL(u->str, fill, left);
4723 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4724 if (right)
4725 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4726 }
4727
4728 return u;
4729}
4730
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The expansion uses three names from the enclosing function: a
   `PyObject *str` scratch variable, the target `list`, and an `onError`
   label that is jumped to when creating or appending the slice fails.

   The body is wrapped in do { } while (0) so the macro expands to exactly
   one statement and can be used safely in unbraced if/else bodies. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4741
4742static
4743PyObject *split_whitespace(PyUnicodeObject *self,
4744 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004745 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004747 register Py_ssize_t i;
4748 register Py_ssize_t j;
4749 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750 PyObject *str;
4751
4752 for (i = j = 0; i < len; ) {
4753 /* find a token */
4754 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4755 i++;
4756 j = i;
4757 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4758 i++;
4759 if (j < i) {
4760 if (maxcount-- <= 0)
4761 break;
4762 SPLIT_APPEND(self->str, j, i);
4763 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4764 i++;
4765 j = i;
4766 }
4767 }
4768 if (j < len) {
4769 SPLIT_APPEND(self->str, j, len);
4770 }
4771 return list;
4772
4773 onError:
4774 Py_DECREF(list);
4775 return NULL;
4776}
4777
4778PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004779 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004780{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004781 register Py_ssize_t i;
4782 register Py_ssize_t j;
4783 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004784 PyObject *list;
4785 PyObject *str;
4786 Py_UNICODE *data;
4787
4788 string = PyUnicode_FromObject(string);
4789 if (string == NULL)
4790 return NULL;
4791 data = PyUnicode_AS_UNICODE(string);
4792 len = PyUnicode_GET_SIZE(string);
4793
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794 list = PyList_New(0);
4795 if (!list)
4796 goto onError;
4797
4798 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004799 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004800
Guido van Rossumd57fd912000-03-10 22:53:23 +00004801 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004802 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004803 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004804
4805 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004806 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807 if (i < len) {
4808 if (data[i] == '\r' && i + 1 < len &&
4809 data[i+1] == '\n')
4810 i += 2;
4811 else
4812 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004813 if (keepends)
4814 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004815 }
Guido van Rossum86662912000-04-11 15:38:46 +00004816 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817 j = i;
4818 }
4819 if (j < len) {
4820 SPLIT_APPEND(data, j, len);
4821 }
4822
4823 Py_DECREF(string);
4824 return list;
4825
4826 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004827 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004828 Py_DECREF(string);
4829 return NULL;
4830}
4831
Tim Petersced69f82003-09-16 20:30:58 +00004832static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004833PyObject *split_char(PyUnicodeObject *self,
4834 PyObject *list,
4835 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004836 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004837{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004838 register Py_ssize_t i;
4839 register Py_ssize_t j;
4840 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841 PyObject *str;
4842
4843 for (i = j = 0; i < len; ) {
4844 if (self->str[i] == ch) {
4845 if (maxcount-- <= 0)
4846 break;
4847 SPLIT_APPEND(self->str, j, i);
4848 i = j = i + 1;
4849 } else
4850 i++;
4851 }
4852 if (j <= len) {
4853 SPLIT_APPEND(self->str, j, len);
4854 }
4855 return list;
4856
4857 onError:
4858 Py_DECREF(list);
4859 return NULL;
4860}
4861
Tim Petersced69f82003-09-16 20:30:58 +00004862static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004863PyObject *split_substring(PyUnicodeObject *self,
4864 PyObject *list,
4865 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004866 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004867{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004868 register Py_ssize_t i;
4869 register Py_ssize_t j;
4870 Py_ssize_t len = self->length;
4871 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872 PyObject *str;
4873
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004874 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004875 if (Py_UNICODE_MATCH(self, i, substring)) {
4876 if (maxcount-- <= 0)
4877 break;
4878 SPLIT_APPEND(self->str, j, i);
4879 i = j = i + sublen;
4880 } else
4881 i++;
4882 }
4883 if (j <= len) {
4884 SPLIT_APPEND(self->str, j, len);
4885 }
4886 return list;
4887
4888 onError:
4889 Py_DECREF(list);
4890 return NULL;
4891}
4892
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004893static
4894PyObject *rsplit_whitespace(PyUnicodeObject *self,
4895 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004896 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004897{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004898 register Py_ssize_t i;
4899 register Py_ssize_t j;
4900 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004901 PyObject *str;
4902
4903 for (i = j = len - 1; i >= 0; ) {
4904 /* find a token */
4905 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4906 i--;
4907 j = i;
4908 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4909 i--;
4910 if (j > i) {
4911 if (maxcount-- <= 0)
4912 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004913 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004914 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4915 i--;
4916 j = i;
4917 }
4918 }
4919 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004920 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004921 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004922 if (PyList_Reverse(list) < 0)
4923 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004924 return list;
4925
4926 onError:
4927 Py_DECREF(list);
4928 return NULL;
4929}
4930
4931static
4932PyObject *rsplit_char(PyUnicodeObject *self,
4933 PyObject *list,
4934 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004935 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004936{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004937 register Py_ssize_t i;
4938 register Py_ssize_t j;
4939 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004940 PyObject *str;
4941
4942 for (i = j = len - 1; i >= 0; ) {
4943 if (self->str[i] == ch) {
4944 if (maxcount-- <= 0)
4945 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004946 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004947 j = i = i - 1;
4948 } else
4949 i--;
4950 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004951 if (j >= -1) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004952 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004953 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004954 if (PyList_Reverse(list) < 0)
4955 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004956 return list;
4957
4958 onError:
4959 Py_DECREF(list);
4960 return NULL;
4961}
4962
4963static
4964PyObject *rsplit_substring(PyUnicodeObject *self,
4965 PyObject *list,
4966 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004967 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004968{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004969 register Py_ssize_t i;
4970 register Py_ssize_t j;
4971 Py_ssize_t len = self->length;
4972 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004973 PyObject *str;
4974
4975 for (i = len - sublen, j = len; i >= 0; ) {
4976 if (Py_UNICODE_MATCH(self, i, substring)) {
4977 if (maxcount-- <= 0)
4978 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004979 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004980 j = i;
4981 i -= sublen;
4982 } else
4983 i--;
4984 }
4985 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004986 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004987 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004988 if (PyList_Reverse(list) < 0)
4989 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004990 return list;
4991
4992 onError:
4993 Py_DECREF(list);
4994 return NULL;
4995}
4996
Guido van Rossumd57fd912000-03-10 22:53:23 +00004997#undef SPLIT_APPEND
4998
4999static
5000PyObject *split(PyUnicodeObject *self,
5001 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005002 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005003{
5004 PyObject *list;
5005
5006 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005007 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005008
5009 list = PyList_New(0);
5010 if (!list)
5011 return NULL;
5012
5013 if (substring == NULL)
5014 return split_whitespace(self,list,maxcount);
5015
5016 else if (substring->length == 1)
5017 return split_char(self,list,substring->str[0],maxcount);
5018
5019 else if (substring->length == 0) {
5020 Py_DECREF(list);
5021 PyErr_SetString(PyExc_ValueError, "empty separator");
5022 return NULL;
5023 }
5024 else
5025 return split_substring(self,list,substring,maxcount);
5026}
5027
Tim Petersced69f82003-09-16 20:30:58 +00005028static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005029PyObject *rsplit(PyUnicodeObject *self,
5030 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005031 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005032{
5033 PyObject *list;
5034
5035 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005036 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005037
5038 list = PyList_New(0);
5039 if (!list)
5040 return NULL;
5041
5042 if (substring == NULL)
5043 return rsplit_whitespace(self,list,maxcount);
5044
5045 else if (substring->length == 1)
5046 return rsplit_char(self,list,substring->str[0],maxcount);
5047
5048 else if (substring->length == 0) {
5049 Py_DECREF(list);
5050 PyErr_SetString(PyExc_ValueError, "empty separator");
5051 return NULL;
5052 }
5053 else
5054 return rsplit_substring(self,list,substring,maxcount);
5055}
5056
5057static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005058PyObject *replace(PyUnicodeObject *self,
5059 PyUnicodeObject *str1,
5060 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005061 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005062{
5063 PyUnicodeObject *u;
5064
5065 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005066 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005067
Fredrik Lundh347ee272006-05-24 16:35:18 +00005068 if (str1->length == str2->length) {
5069 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005070 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005071 if (str1->length == 1) {
5072 /* replace characters */
5073 Py_UNICODE u1, u2;
5074 if (!findchar(self->str, self->length, str1->str[0]))
5075 goto nothing;
5076 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5077 if (!u)
5078 return NULL;
5079 Py_UNICODE_COPY(u->str, self->str, self->length);
5080 u1 = str1->str[0];
5081 u2 = str2->str[0];
5082 for (i = 0; i < u->length; i++)
5083 if (u->str[i] == u1) {
5084 if (--maxcount < 0)
5085 break;
5086 u->str[i] = u2;
5087 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005088 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005089 i = fastsearch(
5090 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005091 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005092 if (i < 0)
5093 goto nothing;
5094 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5095 if (!u)
5096 return NULL;
5097 Py_UNICODE_COPY(u->str, self->str, self->length);
5098 while (i <= self->length - str1->length)
5099 if (Py_UNICODE_MATCH(self, i, str1)) {
5100 if (--maxcount < 0)
5101 break;
5102 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5103 i += str1->length;
5104 } else
5105 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005106 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005108
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005109 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005110 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005111 Py_UNICODE *p;
5112
5113 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005114 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005115 if (n > maxcount)
5116 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005117 if (n == 0)
5118 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005119 /* new_size = self->length + n * (str2->length - str1->length)); */
5120 delta = (str2->length - str1->length);
5121 if (delta == 0) {
5122 new_size = self->length;
5123 } else {
5124 product = n * (str2->length - str1->length);
5125 if ((product / (str2->length - str1->length)) != n) {
5126 PyErr_SetString(PyExc_OverflowError,
5127 "replace string is too long");
5128 return NULL;
5129 }
5130 new_size = self->length + product;
5131 if (new_size < 0) {
5132 PyErr_SetString(PyExc_OverflowError,
5133 "replace string is too long");
5134 return NULL;
5135 }
5136 }
5137 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005138 if (!u)
5139 return NULL;
5140 i = 0;
5141 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005142 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005143 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005144 while (n-- > 0) {
5145 /* look for next match */
5146 j = i;
5147 while (j <= e) {
5148 if (Py_UNICODE_MATCH(self, j, str1))
5149 break;
5150 j++;
5151 }
5152 if (j > i) {
5153 if (j > e)
5154 break;
5155 /* copy unchanged part [i:j] */
5156 Py_UNICODE_COPY(p, self->str+i, j-i);
5157 p += j - i;
5158 }
5159 /* copy substitution string */
5160 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005161 Py_UNICODE_COPY(p, str2->str, str2->length);
5162 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005163 }
5164 i = j + str1->length;
5165 }
5166 if (i < self->length)
5167 /* copy tail [i:] */
5168 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005169 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005170 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005171 while (n > 0) {
5172 Py_UNICODE_COPY(p, str2->str, str2->length);
5173 p += str2->length;
5174 if (--n <= 0)
5175 break;
5176 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005177 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005178 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179 }
5180 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005181 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005182
5183nothing:
5184 /* nothing to replace; return original string (when possible) */
5185 if (PyUnicode_CheckExact(self)) {
5186 Py_INCREF(self);
5187 return (PyObject *) self;
5188 }
5189 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190}
5191
5192/* --- Unicode Object Methods --------------------------------------------- */
5193
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005194PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195"S.title() -> unicode\n\
5196\n\
5197Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005198characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199
5200static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005201unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203 return fixup(self, fixtitle);
5204}
5205
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005206PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207"S.capitalize() -> unicode\n\
5208\n\
5209Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005210have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005211
5212static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005213unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215 return fixup(self, fixcapitalize);
5216}
5217
5218#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005219PyDoc_STRVAR(capwords__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220"S.capwords() -> unicode\n\
5221\n\
5222Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005223normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005224
5225static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005226unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227{
5228 PyObject *list;
5229 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005230 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232 /* Split into words */
5233 list = split(self, NULL, -1);
5234 if (!list)
5235 return NULL;
5236
5237 /* Capitalize each word */
5238 for (i = 0; i < PyList_GET_SIZE(list); i++) {
5239 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
5240 fixcapitalize);
5241 if (item == NULL)
5242 goto onError;
5243 Py_DECREF(PyList_GET_ITEM(list, i));
5244 PyList_SET_ITEM(list, i, item);
5245 }
5246
5247 /* Join the words to form a new string */
5248 item = PyUnicode_Join(NULL, list);
5249
5250onError:
5251 Py_DECREF(list);
5252 return (PyObject *)item;
5253}
5254#endif
5255
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005256/* Argument converter. Coerces to a single unicode character */
5257
5258static int
5259convert_uc(PyObject *obj, void *addr)
5260{
5261 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5262 PyObject *uniobj;
5263 Py_UNICODE *unistr;
5264
5265 uniobj = PyUnicode_FromObject(obj);
5266 if (uniobj == NULL) {
5267 PyErr_SetString(PyExc_TypeError,
5268 "The fill character cannot be converted to Unicode");
5269 return 0;
5270 }
5271 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5272 PyErr_SetString(PyExc_TypeError,
5273 "The fill character must be exactly one character long");
5274 Py_DECREF(uniobj);
5275 return 0;
5276 }
5277 unistr = PyUnicode_AS_UNICODE(uniobj);
5278 *fillcharloc = unistr[0];
5279 Py_DECREF(uniobj);
5280 return 1;
5281}
5282
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005283PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005284"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005286Return S centered in a Unicode string of length width. Padding is\n\
5287done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288
5289static PyObject *
5290unicode_center(PyUnicodeObject *self, PyObject *args)
5291{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005292 Py_ssize_t marg, left;
5293 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005294 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295
Thomas Woutersde017742006-02-16 19:34:37 +00005296 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005297 return NULL;
5298
Tim Peters7a29bd52001-09-12 03:03:31 +00005299 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005300 Py_INCREF(self);
5301 return (PyObject*) self;
5302 }
5303
5304 marg = width - self->length;
5305 left = marg / 2 + (marg & width & 1);
5306
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005307 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005308}
5309
Marc-André Lemburge5034372000-08-08 08:04:29 +00005310#if 0
5311
5312/* This code should go into some future Unicode collation support
5313 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005314 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005315
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005316/* speedy UTF-16 code point order comparison */
5317/* gleaned from: */
5318/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5319
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005320static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005321{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005322 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005323 0, 0, 0, 0, 0, 0, 0, 0,
5324 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005325 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005326};
5327
Guido van Rossumd57fd912000-03-10 22:53:23 +00005328static int
5329unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5330{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005331 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005332
Guido van Rossumd57fd912000-03-10 22:53:23 +00005333 Py_UNICODE *s1 = str1->str;
5334 Py_UNICODE *s2 = str2->str;
5335
5336 len1 = str1->length;
5337 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005338
Guido van Rossumd57fd912000-03-10 22:53:23 +00005339 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005340 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005341
5342 c1 = *s1++;
5343 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005344
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005345 if (c1 > (1<<11) * 26)
5346 c1 += utf16Fixup[c1>>11];
5347 if (c2 > (1<<11) * 26)
5348 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005349 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005350
5351 if (c1 != c2)
5352 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005353
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005354 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355 }
5356
5357 return (len1 < len2) ? -1 : (len1 != len2);
5358}
5359
Marc-André Lemburge5034372000-08-08 08:04:29 +00005360#else
5361
5362static int
5363unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5364{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005365 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005366
5367 Py_UNICODE *s1 = str1->str;
5368 Py_UNICODE *s2 = str2->str;
5369
5370 len1 = str1->length;
5371 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005372
Marc-André Lemburge5034372000-08-08 08:04:29 +00005373 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005374 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005375
Fredrik Lundh45714e92001-06-26 16:39:36 +00005376 c1 = *s1++;
5377 c2 = *s2++;
5378
5379 if (c1 != c2)
5380 return (c1 < c2) ? -1 : 1;
5381
Marc-André Lemburge5034372000-08-08 08:04:29 +00005382 len1--; len2--;
5383 }
5384
5385 return (len1 < len2) ? -1 : (len1 != len2);
5386}
5387
5388#endif
5389
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390int PyUnicode_Compare(PyObject *left,
5391 PyObject *right)
5392{
5393 PyUnicodeObject *u = NULL, *v = NULL;
5394 int result;
5395
5396 /* Coerce the two arguments */
5397 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5398 if (u == NULL)
5399 goto onError;
5400 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5401 if (v == NULL)
5402 goto onError;
5403
Thomas Wouters7e474022000-07-16 12:04:32 +00005404 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405 if (v == u) {
5406 Py_DECREF(u);
5407 Py_DECREF(v);
5408 return 0;
5409 }
5410
5411 result = unicode_compare(u, v);
5412
5413 Py_DECREF(u);
5414 Py_DECREF(v);
5415 return result;
5416
5417onError:
5418 Py_XDECREF(u);
5419 Py_XDECREF(v);
5420 return -1;
5421}
5422
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00005423PyObject *PyUnicode_RichCompare(PyObject *left,
5424 PyObject *right,
5425 int op)
5426{
5427 int result;
5428
5429 result = PyUnicode_Compare(left, right);
5430 if (result == -1 && PyErr_Occurred())
5431 goto onError;
5432
5433 /* Convert the return value to a Boolean */
5434 switch (op) {
5435 case Py_EQ:
5436 result = (result == 0);
5437 break;
5438 case Py_NE:
5439 result = (result != 0);
5440 break;
5441 case Py_LE:
5442 result = (result <= 0);
5443 break;
5444 case Py_GE:
5445 result = (result >= 0);
5446 break;
5447 case Py_LT:
5448 result = (result == -1);
5449 break;
5450 case Py_GT:
5451 result = (result == 1);
5452 break;
5453 }
5454 return PyBool_FromLong(result);
5455
5456 onError:
5457
5458 /* Standard case
5459
5460 Type errors mean that PyUnicode_FromObject() could not convert
5461 one of the arguments (usually the right hand side) to Unicode,
5462 ie. we can't handle the comparison request. However, it is
5463 possible that the other object knows a comparison method, which
5464 is why we return Py_NotImplemented to give the other object a
5465 chance.
5466
5467 */
5468 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5469 PyErr_Clear();
5470 Py_INCREF(Py_NotImplemented);
5471 return Py_NotImplemented;
5472 }
5473 if (op != Py_EQ && op != Py_NE)
5474 return NULL;
5475
5476 /* Equality comparison.
5477
5478 This is a special case: we silence any PyExc_UnicodeDecodeError
5479 and instead turn it into a PyErr_UnicodeWarning.
5480
5481 */
5482 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5483 return NULL;
5484 PyErr_Clear();
5485 if (PyErr_Warn(PyExc_UnicodeWarning,
5486 (op == Py_EQ) ?
5487 "Unicode equal comparison "
5488 "failed to convert both arguments to Unicode - "
5489 "interpreting them as being unequal" :
5490 "Unicode unequal comparison "
5491 "failed to convert both arguments to Unicode - "
5492 "interpreting them as being unequal"
5493 ) < 0)
5494 return NULL;
5495 result = (op == Py_NE);
5496 return PyBool_FromLong(result);
5497}
5498
Guido van Rossum403d68b2000-03-13 15:55:09 +00005499int PyUnicode_Contains(PyObject *container,
5500 PyObject *element)
5501{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005502 PyObject *str, *sub;
5503 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005504
5505 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005506 sub = PyUnicode_FromObject(element);
5507 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005508 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005509 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00005510 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005511 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00005512
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005513 str = PyUnicode_FromObject(container);
5514 if (!str) {
5515 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00005516 return -1;
5517 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005518
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005519 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00005520
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005521 Py_DECREF(str);
5522 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00005523
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005524 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005525}
5526
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527/* Concat to string or Unicode object giving a new Unicode object. */
5528
5529PyObject *PyUnicode_Concat(PyObject *left,
5530 PyObject *right)
5531{
5532 PyUnicodeObject *u = NULL, *v = NULL, *w;
5533
5534 /* Coerce the two arguments */
5535 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5536 if (u == NULL)
5537 goto onError;
5538 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5539 if (v == NULL)
5540 goto onError;
5541
5542 /* Shortcuts */
5543 if (v == unicode_empty) {
5544 Py_DECREF(v);
5545 return (PyObject *)u;
5546 }
5547 if (u == unicode_empty) {
5548 Py_DECREF(u);
5549 return (PyObject *)v;
5550 }
5551
5552 /* Concat the two Unicode strings */
5553 w = _PyUnicode_New(u->length + v->length);
5554 if (w == NULL)
5555 goto onError;
5556 Py_UNICODE_COPY(w->str, u->str, u->length);
5557 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5558
5559 Py_DECREF(u);
5560 Py_DECREF(v);
5561 return (PyObject *)w;
5562
5563onError:
5564 Py_XDECREF(u);
5565 Py_XDECREF(v);
5566 return NULL;
5567}
5568
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005569PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570"S.count(sub[, start[, end]]) -> int\n\
5571\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00005572Return the number of non-overlapping occurrences of substring sub in\n\
5573Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005574interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005575
5576static PyObject *
5577unicode_count(PyUnicodeObject *self, PyObject *args)
5578{
5579 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005580 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005581 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582 PyObject *result;
5583
Guido van Rossumb8872e62000-05-09 14:14:27 +00005584 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5585 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586 return NULL;
5587
5588 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005589 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590 if (substring == NULL)
5591 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005592
Fredrik Lundhc8162812006-05-26 19:33:03 +00005593 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005595 result = PyInt_FromSsize_t(
5596 stringlib_count(self->str + start, end - start,
5597 substring->str, substring->length)
5598 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599
5600 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005601
Guido van Rossumd57fd912000-03-10 22:53:23 +00005602 return result;
5603}
5604
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005605PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005606"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005608Encodes S using the codec registered for encoding. encoding defaults\n\
5609to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005610handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005611a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5612'xmlcharrefreplace' as well as any other name registered with\n\
5613codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614
5615static PyObject *
5616unicode_encode(PyUnicodeObject *self, PyObject *args)
5617{
5618 char *encoding = NULL;
5619 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005620 PyObject *v;
5621
Guido van Rossumd57fd912000-03-10 22:53:23 +00005622 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5623 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005624 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005625 if (v == NULL)
5626 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005627 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5628 PyErr_Format(PyExc_TypeError,
5629 "encoder did not return a string/unicode object "
5630 "(type=%.400s)",
5631 v->ob_type->tp_name);
5632 Py_DECREF(v);
5633 return NULL;
5634 }
5635 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005636
5637 onError:
5638 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005639}
5640
5641PyDoc_STRVAR(decode__doc__,
5642"S.decode([encoding[,errors]]) -> string or unicode\n\
5643\n\
5644Decodes S using the codec registered for encoding. encoding defaults\n\
5645to the default encoding. errors may be given to set a different error\n\
5646handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5647a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5648as well as any other name registerd with codecs.register_error that is\n\
5649able to handle UnicodeDecodeErrors.");
5650
5651static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005652unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005653{
5654 char *encoding = NULL;
5655 char *errors = NULL;
5656 PyObject *v;
5657
5658 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5659 return NULL;
5660 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005661 if (v == NULL)
5662 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005663 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5664 PyErr_Format(PyExc_TypeError,
5665 "decoder did not return a string/unicode object "
5666 "(type=%.400s)",
5667 v->ob_type->tp_name);
5668 Py_DECREF(v);
5669 return NULL;
5670 }
5671 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005672
5673 onError:
5674 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675}
5676
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005677PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678"S.expandtabs([tabsize]) -> unicode\n\
5679\n\
5680Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005681If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682
5683static PyObject*
5684unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5685{
5686 Py_UNICODE *e;
5687 Py_UNICODE *p;
5688 Py_UNICODE *q;
Neal Norwitz66e64e22007-06-09 04:06:30 +00005689 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 PyUnicodeObject *u;
5691 int tabsize = 8;
5692
5693 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5694 return NULL;
5695
Thomas Wouters7e474022000-07-16 12:04:32 +00005696 /* First pass: determine size of output string */
Neal Norwitz66e64e22007-06-09 04:06:30 +00005697 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698 e = self->str + self->length;
5699 for (p = self->str; p < e; p++)
5700 if (*p == '\t') {
Neal Norwitz66e64e22007-06-09 04:06:30 +00005701 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702 j += tabsize - (j % tabsize);
Neal Norwitz66e64e22007-06-09 04:06:30 +00005703 if (old_j > j) {
Neal Norwitz8355dd52007-06-11 04:32:41 +00005704 PyErr_SetString(PyExc_OverflowError,
5705 "new string is too long");
Neal Norwitz66e64e22007-06-09 04:06:30 +00005706 return NULL;
5707 }
5708 old_j = j;
5709 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710 }
5711 else {
5712 j++;
5713 if (*p == '\n' || *p == '\r') {
5714 i += j;
Neal Norwitz8355dd52007-06-11 04:32:41 +00005715 old_j = j = 0;
5716 if (i < 0) {
5717 PyErr_SetString(PyExc_OverflowError,
5718 "new string is too long");
5719 return NULL;
5720 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721 }
5722 }
5723
Neal Norwitz66e64e22007-06-09 04:06:30 +00005724 if ((i + j) < 0) {
5725 PyErr_SetString(PyExc_OverflowError, "new string is too long");
5726 return NULL;
5727 }
5728
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729 /* Second pass: create output string and fill it */
5730 u = _PyUnicode_New(i + j);
5731 if (!u)
5732 return NULL;
5733
5734 j = 0;
5735 q = u->str;
5736
5737 for (p = self->str; p < e; p++)
5738 if (*p == '\t') {
5739 if (tabsize > 0) {
5740 i = tabsize - (j % tabsize);
5741 j += i;
5742 while (i--)
5743 *q++ = ' ';
5744 }
5745 }
5746 else {
5747 j++;
5748 *q++ = *p;
5749 if (*p == '\n' || *p == '\r')
5750 j = 0;
5751 }
5752
5753 return (PyObject*) u;
5754}
5755
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005756PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757"S.find(sub [,start [,end]]) -> int\n\
5758\n\
5759Return the lowest index in S where substring sub is found,\n\
Georg Brandlb4d100c2007-07-29 17:37:22 +00005760such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761arguments start and end are interpreted as in slice notation.\n\
5762\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005763Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764
5765static PyObject *
5766unicode_find(PyUnicodeObject *self, PyObject *args)
5767{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005768 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005769 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005770 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005771 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772
Guido van Rossumb8872e62000-05-09 14:14:27 +00005773 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5774 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005776 substring = PyUnicode_FromObject(substring);
5777 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778 return NULL;
5779
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005780 result = stringlib_find_slice(
5781 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5782 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5783 start, end
5784 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785
5786 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005787
5788 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789}
5790
5791static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005792unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005793{
5794 if (index < 0 || index >= self->length) {
5795 PyErr_SetString(PyExc_IndexError, "string index out of range");
5796 return NULL;
5797 }
5798
5799 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5800}
5801
5802static long
5803unicode_hash(PyUnicodeObject *self)
5804{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005805 /* Since Unicode objects compare equal to their ASCII string
5806 counterparts, they should use the individual character values
5807 as basis for their hash value. This is needed to assure that
5808 strings and Unicode objects behave in the same way as
5809 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810
Martin v. Löwis18e16552006-02-15 17:27:45 +00005811 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005812 register Py_UNICODE *p;
5813 register long x;
5814
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815 if (self->hash != -1)
5816 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005817 len = PyUnicode_GET_SIZE(self);
5818 p = PyUnicode_AS_UNICODE(self);
5819 x = *p << 7;
5820 while (--len >= 0)
5821 x = (1000003*x) ^ *p++;
5822 x ^= PyUnicode_GET_SIZE(self);
5823 if (x == -1)
5824 x = -2;
5825 self->hash = x;
5826 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827}
5828
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005829PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005830"S.index(sub [,start [,end]]) -> int\n\
5831\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005832Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005833
5834static PyObject *
5835unicode_index(PyUnicodeObject *self, PyObject *args)
5836{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005837 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005838 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005839 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005840 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841
Guido van Rossumb8872e62000-05-09 14:14:27 +00005842 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5843 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005845 substring = PyUnicode_FromObject(substring);
5846 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847 return NULL;
5848
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005849 result = stringlib_find_slice(
5850 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5851 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5852 start, end
5853 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854
5855 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005856
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857 if (result < 0) {
5858 PyErr_SetString(PyExc_ValueError, "substring not found");
5859 return NULL;
5860 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005861
Martin v. Löwis18e16552006-02-15 17:27:45 +00005862 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863}
5864
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005865PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005866"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005868Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005869at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870
5871static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005872unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873{
5874 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5875 register const Py_UNICODE *e;
5876 int cased;
5877
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878 /* Shortcut for single character strings */
5879 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005880 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005882 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005883 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005884 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005885
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886 e = p + PyUnicode_GET_SIZE(self);
5887 cased = 0;
5888 for (; p < e; p++) {
5889 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005890
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005892 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893 else if (!cased && Py_UNICODE_ISLOWER(ch))
5894 cased = 1;
5895 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005896 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897}
5898
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005899PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005900"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005902Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005903at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904
5905static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005906unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907{
5908 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5909 register const Py_UNICODE *e;
5910 int cased;
5911
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912 /* Shortcut for single character strings */
5913 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005914 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005916 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005917 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005918 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005919
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920 e = p + PyUnicode_GET_SIZE(self);
5921 cased = 0;
5922 for (; p < e; p++) {
5923 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005924
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005926 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927 else if (!cased && Py_UNICODE_ISUPPER(ch))
5928 cased = 1;
5929 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005930 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931}
5932
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005933PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005934"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005936Return True if S is a titlecased string and there is at least one\n\
5937character in S, i.e. upper- and titlecase characters may only\n\
5938follow uncased characters and lowercase characters only cased ones.\n\
5939Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940
5941static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005942unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943{
5944 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5945 register const Py_UNICODE *e;
5946 int cased, previous_is_cased;
5947
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948 /* Shortcut for single character strings */
5949 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005950 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5951 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005953 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005954 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005955 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005956
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957 e = p + PyUnicode_GET_SIZE(self);
5958 cased = 0;
5959 previous_is_cased = 0;
5960 for (; p < e; p++) {
5961 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005962
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5964 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005965 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966 previous_is_cased = 1;
5967 cased = 1;
5968 }
5969 else if (Py_UNICODE_ISLOWER(ch)) {
5970 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005971 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972 previous_is_cased = 1;
5973 cased = 1;
5974 }
5975 else
5976 previous_is_cased = 0;
5977 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005978 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979}
5980
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005981PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005982"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005984Return True if all characters in S are whitespace\n\
5985and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986
5987static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005988unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989{
5990 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5991 register const Py_UNICODE *e;
5992
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993 /* Shortcut for single character strings */
5994 if (PyUnicode_GET_SIZE(self) == 1 &&
5995 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005996 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005998 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005999 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006000 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006001
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002 e = p + PyUnicode_GET_SIZE(self);
6003 for (; p < e; p++) {
6004 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006005 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006007 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008}
6009
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006010PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006011"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006012\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006013Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006014and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006015
6016static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006017unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006018{
6019 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6020 register const Py_UNICODE *e;
6021
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006022 /* Shortcut for single character strings */
6023 if (PyUnicode_GET_SIZE(self) == 1 &&
6024 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006025 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006026
6027 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006028 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006029 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006030
6031 e = p + PyUnicode_GET_SIZE(self);
6032 for (; p < e; p++) {
6033 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006034 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006035 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006036 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006037}
6038
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006039PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006040"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006041\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006042Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006043and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006044
6045static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006046unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006047{
6048 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6049 register const Py_UNICODE *e;
6050
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006051 /* Shortcut for single character strings */
6052 if (PyUnicode_GET_SIZE(self) == 1 &&
6053 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006054 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006055
6056 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006057 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006058 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006059
6060 e = p + PyUnicode_GET_SIZE(self);
6061 for (; p < e; p++) {
6062 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006063 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006064 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006065 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006066}
6067
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006068PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006069"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006071Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006072False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073
6074static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006075unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076{
6077 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6078 register const Py_UNICODE *e;
6079
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 /* Shortcut for single character strings */
6081 if (PyUnicode_GET_SIZE(self) == 1 &&
6082 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006083 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006085 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006086 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006087 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006088
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089 e = p + PyUnicode_GET_SIZE(self);
6090 for (; p < e; p++) {
6091 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006092 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006094 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095}
6096
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006097PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006098"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006100Return True if all characters in S are digits\n\
6101and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102
6103static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006104unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105{
6106 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6107 register const Py_UNICODE *e;
6108
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109 /* Shortcut for single character strings */
6110 if (PyUnicode_GET_SIZE(self) == 1 &&
6111 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006112 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006114 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006115 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006116 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006117
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118 e = p + PyUnicode_GET_SIZE(self);
6119 for (; p < e; p++) {
6120 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006121 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006123 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124}
6125
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006126PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006127"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006129Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006130False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131
6132static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006133unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134{
6135 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6136 register const Py_UNICODE *e;
6137
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138 /* Shortcut for single character strings */
6139 if (PyUnicode_GET_SIZE(self) == 1 &&
6140 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006141 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006143 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006144 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006145 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006146
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147 e = p + PyUnicode_GET_SIZE(self);
6148 for (; p < e; p++) {
6149 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006150 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006152 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153}
6154
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006155PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156"S.join(sequence) -> unicode\n\
6157\n\
6158Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006159sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160
6161static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006162unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006164 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165}
6166
Martin v. Löwis18e16552006-02-15 17:27:45 +00006167static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168unicode_length(PyUnicodeObject *self)
6169{
6170 return self->length;
6171}
6172
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006173PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006174"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175\n\
6176Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006177done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178
6179static PyObject *
6180unicode_ljust(PyUnicodeObject *self, PyObject *args)
6181{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006182 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006183 Py_UNICODE fillchar = ' ';
6184
Martin v. Löwis412fb672006-04-13 06:34:32 +00006185 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186 return NULL;
6187
Tim Peters7a29bd52001-09-12 03:03:31 +00006188 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006189 Py_INCREF(self);
6190 return (PyObject*) self;
6191 }
6192
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006193 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194}
6195
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006196PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197"S.lower() -> unicode\n\
6198\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006199Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200
6201static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006202unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204 return fixup(self, fixlower);
6205}
6206
/* Strip modes; they double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the strip modes above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (&stripformat[i][3])
6215
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006216/* externally visible for str.strip(unicode) */
6217PyObject *
6218_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6219{
6220 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006221 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006222 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006223 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6224 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006225
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006226 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6227
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006228 i = 0;
6229 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006230 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6231 i++;
6232 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006233 }
6234
6235 j = len;
6236 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006237 do {
6238 j--;
6239 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6240 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006241 }
6242
6243 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006244 Py_INCREF(self);
6245 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006246 }
6247 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006248 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006249}
6250
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251
6252static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006253do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006255 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006256 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006257
6258 i = 0;
6259 if (striptype != RIGHTSTRIP) {
6260 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6261 i++;
6262 }
6263 }
6264
6265 j = len;
6266 if (striptype != LEFTSTRIP) {
6267 do {
6268 j--;
6269 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6270 j++;
6271 }
6272
6273 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6274 Py_INCREF(self);
6275 return (PyObject*)self;
6276 }
6277 else
6278 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279}
6280
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006281
6282static PyObject *
6283do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6284{
6285 PyObject *sep = NULL;
6286
6287 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6288 return NULL;
6289
6290 if (sep != NULL && sep != Py_None) {
6291 if (PyUnicode_Check(sep))
6292 return _PyUnicode_XStrip(self, striptype, sep);
6293 else if (PyString_Check(sep)) {
6294 PyObject *res;
6295 sep = PyUnicode_FromObject(sep);
6296 if (sep==NULL)
6297 return NULL;
6298 res = _PyUnicode_XStrip(self, striptype, sep);
6299 Py_DECREF(sep);
6300 return res;
6301 }
6302 else {
6303 PyErr_Format(PyExc_TypeError,
6304 "%s arg must be None, unicode or str",
6305 STRIPNAME(striptype));
6306 return NULL;
6307 }
6308 }
6309
6310 return do_strip(self, striptype);
6311}
6312
6313
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006314PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006315"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006316\n\
6317Return a copy of the string S with leading and trailing\n\
6318whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006319If chars is given and not None, remove characters in chars instead.\n\
6320If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006321
6322static PyObject *
6323unicode_strip(PyUnicodeObject *self, PyObject *args)
6324{
6325 if (PyTuple_GET_SIZE(args) == 0)
6326 return do_strip(self, BOTHSTRIP); /* Common case */
6327 else
6328 return do_argstrip(self, BOTHSTRIP, args);
6329}
6330
6331
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006332PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006333"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006334\n\
6335Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006336If chars is given and not None, remove characters in chars instead.\n\
6337If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006338
6339static PyObject *
6340unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6341{
6342 if (PyTuple_GET_SIZE(args) == 0)
6343 return do_strip(self, LEFTSTRIP); /* Common case */
6344 else
6345 return do_argstrip(self, LEFTSTRIP, args);
6346}
6347
6348
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006349PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006350"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006351\n\
6352Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006353If chars is given and not None, remove characters in chars instead.\n\
6354If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006355
6356static PyObject *
6357unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6358{
6359 if (PyTuple_GET_SIZE(args) == 0)
6360 return do_strip(self, RIGHTSTRIP); /* Common case */
6361 else
6362 return do_argstrip(self, RIGHTSTRIP, args);
6363}
6364
6365
Guido van Rossumd57fd912000-03-10 22:53:23 +00006366static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006367unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368{
6369 PyUnicodeObject *u;
6370 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006371 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006372 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006373
6374 if (len < 0)
6375 len = 0;
6376
Tim Peters7a29bd52001-09-12 03:03:31 +00006377 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378 /* no repeat, return original string */
6379 Py_INCREF(str);
6380 return (PyObject*) str;
6381 }
Tim Peters8f422462000-09-09 06:13:41 +00006382
6383 /* ensure # of chars needed doesn't overflow int and # of bytes
6384 * needed doesn't overflow size_t
6385 */
6386 nchars = len * str->length;
6387 if (len && nchars / len != str->length) {
6388 PyErr_SetString(PyExc_OverflowError,
6389 "repeated string is too long");
6390 return NULL;
6391 }
6392 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6393 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6394 PyErr_SetString(PyExc_OverflowError,
6395 "repeated string is too long");
6396 return NULL;
6397 }
6398 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399 if (!u)
6400 return NULL;
6401
6402 p = u->str;
6403
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006404 if (str->length == 1 && len > 0) {
6405 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006406 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00006407 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006408 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006409 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006410 done = str->length;
6411 }
6412 while (done < nchars) {
6413 int n = (done <= nchars-done) ? done : nchars-done;
6414 Py_UNICODE_COPY(p+done, p, n);
6415 done += n;
6416 }
6417 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418
6419 return (PyObject*) u;
6420}
6421
6422PyObject *PyUnicode_Replace(PyObject *obj,
6423 PyObject *subobj,
6424 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006425 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426{
6427 PyObject *self;
6428 PyObject *str1;
6429 PyObject *str2;
6430 PyObject *result;
6431
6432 self = PyUnicode_FromObject(obj);
6433 if (self == NULL)
6434 return NULL;
6435 str1 = PyUnicode_FromObject(subobj);
6436 if (str1 == NULL) {
6437 Py_DECREF(self);
6438 return NULL;
6439 }
6440 str2 = PyUnicode_FromObject(replobj);
6441 if (str2 == NULL) {
6442 Py_DECREF(self);
6443 Py_DECREF(str1);
6444 return NULL;
6445 }
Tim Petersced69f82003-09-16 20:30:58 +00006446 result = replace((PyUnicodeObject *)self,
6447 (PyUnicodeObject *)str1,
6448 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449 maxcount);
6450 Py_DECREF(self);
6451 Py_DECREF(str1);
6452 Py_DECREF(str2);
6453 return result;
6454}
6455
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006456PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457"S.replace (old, new[, maxsplit]) -> unicode\n\
6458\n\
6459Return a copy of S with all occurrences of substring\n\
6460old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006461given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462
6463static PyObject*
6464unicode_replace(PyUnicodeObject *self, PyObject *args)
6465{
6466 PyUnicodeObject *str1;
6467 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006468 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469 PyObject *result;
6470
Martin v. Löwis18e16552006-02-15 17:27:45 +00006471 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472 return NULL;
6473 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6474 if (str1 == NULL)
6475 return NULL;
6476 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006477 if (str2 == NULL) {
6478 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006480 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481
6482 result = replace(self, str1, str2, maxcount);
6483
6484 Py_DECREF(str1);
6485 Py_DECREF(str2);
6486 return result;
6487}
6488
6489static
6490PyObject *unicode_repr(PyObject *unicode)
6491{
6492 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6493 PyUnicode_GET_SIZE(unicode),
6494 1);
6495}
6496
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006497PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498"S.rfind(sub [,start [,end]]) -> int\n\
6499\n\
6500Return the highest index in S where substring sub is found,\n\
Georg Brandlb4d100c2007-07-29 17:37:22 +00006501such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502arguments start and end are interpreted as in slice notation.\n\
6503\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006504Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505
6506static PyObject *
6507unicode_rfind(PyUnicodeObject *self, PyObject *args)
6508{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006509 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006510 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006511 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006512 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513
Guido van Rossumb8872e62000-05-09 14:14:27 +00006514 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6515 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006517 substring = PyUnicode_FromObject(substring);
6518 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519 return NULL;
6520
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006521 result = stringlib_rfind_slice(
6522 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6523 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6524 start, end
6525 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526
6527 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006528
6529 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530}
6531
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006532PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533"S.rindex(sub [,start [,end]]) -> int\n\
6534\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006535Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006536
6537static PyObject *
6538unicode_rindex(PyUnicodeObject *self, PyObject *args)
6539{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006540 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006541 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006542 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006543 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544
Guido van Rossumb8872e62000-05-09 14:14:27 +00006545 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6546 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006548 substring = PyUnicode_FromObject(substring);
6549 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550 return NULL;
6551
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006552 result = stringlib_rfind_slice(
6553 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6554 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6555 start, end
6556 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557
6558 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006559
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560 if (result < 0) {
6561 PyErr_SetString(PyExc_ValueError, "substring not found");
6562 return NULL;
6563 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006564 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565}
6566
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006567PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006568"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569\n\
6570Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006571done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572
6573static PyObject *
6574unicode_rjust(PyUnicodeObject *self, PyObject *args)
6575{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006576 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006577 Py_UNICODE fillchar = ' ';
6578
Martin v. Löwis412fb672006-04-13 06:34:32 +00006579 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580 return NULL;
6581
Tim Peters7a29bd52001-09-12 03:03:31 +00006582 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583 Py_INCREF(self);
6584 return (PyObject*) self;
6585 }
6586
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006587 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588}
6589
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006591unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592{
6593 /* standard clamping */
6594 if (start < 0)
6595 start = 0;
6596 if (end < 0)
6597 end = 0;
6598 if (end > self->length)
6599 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006600 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601 /* full slice, return original string */
6602 Py_INCREF(self);
6603 return (PyObject*) self;
6604 }
6605 if (start > end)
6606 start = end;
6607 /* copy slice */
6608 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6609 end - start);
6610}
6611
6612PyObject *PyUnicode_Split(PyObject *s,
6613 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006614 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615{
6616 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006617
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618 s = PyUnicode_FromObject(s);
6619 if (s == NULL)
6620 return NULL;
6621 if (sep != NULL) {
6622 sep = PyUnicode_FromObject(sep);
6623 if (sep == NULL) {
6624 Py_DECREF(s);
6625 return NULL;
6626 }
6627 }
6628
6629 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6630
6631 Py_DECREF(s);
6632 Py_XDECREF(sep);
6633 return result;
6634}
6635
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006636PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637"S.split([sep [,maxsplit]]) -> list of strings\n\
6638\n\
6639Return a list of the words in S, using sep as the\n\
6640delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006641splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006642any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643
6644static PyObject*
6645unicode_split(PyUnicodeObject *self, PyObject *args)
6646{
6647 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006648 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649
Martin v. Löwis18e16552006-02-15 17:27:45 +00006650 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651 return NULL;
6652
6653 if (substring == Py_None)
6654 return split(self, NULL, maxcount);
6655 else if (PyUnicode_Check(substring))
6656 return split(self, (PyUnicodeObject *)substring, maxcount);
6657 else
6658 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6659}
6660
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006661PyObject *
6662PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6663{
6664 PyObject* str_obj;
6665 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006666 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00006667
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006668 str_obj = PyUnicode_FromObject(str_in);
6669 if (!str_obj)
6670 return NULL;
6671 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00006672 if (!sep_obj) {
6673 Py_DECREF(str_obj);
6674 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006675 }
6676
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006677 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00006678 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6679 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6680 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006681
Fredrik Lundhb9479482006-05-26 17:22:38 +00006682 Py_DECREF(sep_obj);
6683 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006684
6685 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006686}
6687
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006688
6689PyObject *
6690PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6691{
6692 PyObject* str_obj;
6693 PyObject* sep_obj;
6694 PyObject* out;
6695
6696 str_obj = PyUnicode_FromObject(str_in);
6697 if (!str_obj)
6698 return NULL;
6699 sep_obj = PyUnicode_FromObject(sep_in);
6700 if (!sep_obj) {
6701 Py_DECREF(str_obj);
6702 return NULL;
6703 }
6704
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006705 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006706 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6707 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6708 );
6709
6710 Py_DECREF(sep_obj);
6711 Py_DECREF(str_obj);
6712
6713 return out;
6714}
6715
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006716PyDoc_STRVAR(partition__doc__,
6717"S.partition(sep) -> (head, sep, tail)\n\
6718\n\
6719Searches for the separator sep in S, and returns the part before it,\n\
6720the separator itself, and the part after it. If the separator is not\n\
6721found, returns S and two empty strings.");
6722
6723static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00006724unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006725{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006726 return PyUnicode_Partition((PyObject *)self, separator);
6727}
6728
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006729PyDoc_STRVAR(rpartition__doc__,
Neal Norwitz29a5fdb2006-09-05 02:21:38 +00006730"S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006731\n\
6732Searches for the separator sep in S, starting at the end of S, and returns\n\
6733the part before it, the separator itself, and the part after it. If the\n\
Neal Norwitz29a5fdb2006-09-05 02:21:38 +00006734separator is not found, returns two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006735
6736static PyObject*
6737unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6738{
6739 return PyUnicode_RPartition((PyObject *)self, separator);
6740}
6741
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006742PyObject *PyUnicode_RSplit(PyObject *s,
6743 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006744 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006745{
6746 PyObject *result;
6747
6748 s = PyUnicode_FromObject(s);
6749 if (s == NULL)
6750 return NULL;
6751 if (sep != NULL) {
6752 sep = PyUnicode_FromObject(sep);
6753 if (sep == NULL) {
6754 Py_DECREF(s);
6755 return NULL;
6756 }
6757 }
6758
6759 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6760
6761 Py_DECREF(s);
6762 Py_XDECREF(sep);
6763 return result;
6764}
6765
6766PyDoc_STRVAR(rsplit__doc__,
6767"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6768\n\
6769Return a list of the words in S, using sep as the\n\
6770delimiter string, starting at the end of the string and\n\
6771working to the front. If maxsplit is given, at most maxsplit\n\
6772splits are done. If sep is not specified, any whitespace string\n\
6773is a separator.");
6774
6775static PyObject*
6776unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6777{
6778 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006779 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006780
Martin v. Löwis18e16552006-02-15 17:27:45 +00006781 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006782 return NULL;
6783
6784 if (substring == Py_None)
6785 return rsplit(self, NULL, maxcount);
6786 else if (PyUnicode_Check(substring))
6787 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6788 else
6789 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6790}
6791
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006792PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006793"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006794\n\
6795Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006796Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006797is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006798
6799static PyObject*
6800unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6801{
Guido van Rossum86662912000-04-11 15:38:46 +00006802 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803
Guido van Rossum86662912000-04-11 15:38:46 +00006804 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805 return NULL;
6806
Guido van Rossum86662912000-04-11 15:38:46 +00006807 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808}
6809
6810static
6811PyObject *unicode_str(PyUnicodeObject *self)
6812{
Fred Drakee4315f52000-05-09 19:53:39 +00006813 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814}
6815
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006816PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817"S.swapcase() -> unicode\n\
6818\n\
6819Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006820and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821
6822static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006823unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825 return fixup(self, fixswapcase);
6826}
6827
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006828PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829"S.translate(table) -> unicode\n\
6830\n\
6831Return a copy of the string S, where all characters have been mapped\n\
6832through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006833Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6834Unmapped characters are left untouched. Characters mapped to None\n\
6835are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836
6837static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006838unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839{
Tim Petersced69f82003-09-16 20:30:58 +00006840 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006842 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843 "ignore");
6844}
6845
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006846PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847"S.upper() -> unicode\n\
6848\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006849Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850
6851static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006852unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854 return fixup(self, fixupper);
6855}
6856
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006857PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858"S.zfill(width) -> unicode\n\
6859\n\
6860Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006861of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862
6863static PyObject *
6864unicode_zfill(PyUnicodeObject *self, PyObject *args)
6865{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006866 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867 PyUnicodeObject *u;
6868
Martin v. Löwis18e16552006-02-15 17:27:45 +00006869 Py_ssize_t width;
6870 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871 return NULL;
6872
6873 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006874 if (PyUnicode_CheckExact(self)) {
6875 Py_INCREF(self);
6876 return (PyObject*) self;
6877 }
6878 else
6879 return PyUnicode_FromUnicode(
6880 PyUnicode_AS_UNICODE(self),
6881 PyUnicode_GET_SIZE(self)
6882 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883 }
6884
6885 fill = width - self->length;
6886
6887 u = pad(self, fill, 0, '0');
6888
Walter Dörwald068325e2002-04-15 13:36:47 +00006889 if (u == NULL)
6890 return NULL;
6891
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892 if (u->str[fill] == '+' || u->str[fill] == '-') {
6893 /* move sign to beginning of string */
6894 u->str[0] = u->str[fill];
6895 u->str[fill] = '0';
6896 }
6897
6898 return (PyObject*) u;
6899}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900
#if 0
/* Compiled out by the enclosing "#if 0": would return the value of
   unicode_freelist_size as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
6908
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006909PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006910"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006912Return True if S starts with the specified prefix, False otherwise.\n\
6913With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00006914With optional end, stop comparing S at that position.\n\
6915prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916
6917static PyObject *
6918unicode_startswith(PyUnicodeObject *self,
6919 PyObject *args)
6920{
Georg Brandl24250812006-06-09 18:45:48 +00006921 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006923 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006924 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00006925 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926
Georg Brandl24250812006-06-09 18:45:48 +00006927 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00006928 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00006930 if (PyTuple_Check(subobj)) {
6931 Py_ssize_t i;
6932 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6933 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6934 PyTuple_GET_ITEM(subobj, i));
6935 if (substring == NULL)
6936 return NULL;
6937 result = tailmatch(self, substring, start, end, -1);
6938 Py_DECREF(substring);
6939 if (result) {
6940 Py_RETURN_TRUE;
6941 }
6942 }
6943 /* nothing matched */
6944 Py_RETURN_FALSE;
6945 }
6946 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00006948 return NULL;
6949 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00006951 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952}
6953
6954
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006955PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006956"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006957\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006958Return True if S ends with the specified suffix, False otherwise.\n\
6959With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00006960With optional end, stop comparing S at that position.\n\
6961suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962
6963static PyObject *
6964unicode_endswith(PyUnicodeObject *self,
6965 PyObject *args)
6966{
Georg Brandl24250812006-06-09 18:45:48 +00006967 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006969 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006970 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00006971 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972
Georg Brandl24250812006-06-09 18:45:48 +00006973 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
6974 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00006976 if (PyTuple_Check(subobj)) {
6977 Py_ssize_t i;
6978 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6979 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6980 PyTuple_GET_ITEM(subobj, i));
6981 if (substring == NULL)
6982 return NULL;
6983 result = tailmatch(self, substring, start, end, +1);
6984 Py_DECREF(substring);
6985 if (result) {
6986 Py_RETURN_TRUE;
6987 }
6988 }
6989 Py_RETURN_FALSE;
6990 }
6991 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006992 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00006993 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994
Georg Brandl24250812006-06-09 18:45:48 +00006995 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00006997 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998}
6999
7000
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007001
7002static PyObject *
7003unicode_getnewargs(PyUnicodeObject *v)
7004{
7005 return Py_BuildValue("(u#)", v->str, v->length);
7006}
7007
7008
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009static PyMethodDef unicode_methods[] = {
7010
7011 /* Order is according to common usage: often used methods should
7012 appear first, since lookup is done sequentially. */
7013
Georg Brandlecdc0a92006-03-30 12:19:07 +00007014 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007015 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7016 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007017 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007018 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7019 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7020 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7021 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7022 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7023 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7024 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007025 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007026 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7027 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7028 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007029 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007030 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007031/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7032 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7033 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7034 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007035 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007036 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007037 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007038 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007039 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7040 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7041 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7042 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7043 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7044 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7045 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7046 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7047 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7048 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7049 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7050 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7051 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7052 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007053 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007054#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007055 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007056#endif
7057
7058#if 0
7059 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007060 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061#endif
7062
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007063 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064 {NULL, NULL}
7065};
7066
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007067static PyObject *
7068unicode_mod(PyObject *v, PyObject *w)
7069{
7070 if (!PyUnicode_Check(v)) {
7071 Py_INCREF(Py_NotImplemented);
7072 return Py_NotImplemented;
7073 }
7074 return PyUnicode_Format(v, w);
7075}
7076
7077static PyNumberMethods unicode_as_number = {
7078 0, /*nb_add*/
7079 0, /*nb_subtract*/
7080 0, /*nb_multiply*/
7081 0, /*nb_divide*/
7082 unicode_mod, /*nb_remainder*/
7083};
7084
Guido van Rossumd57fd912000-03-10 22:53:23 +00007085static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007086 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00007087 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007088 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7089 (ssizeargfunc) unicode_getitem, /* sq_item */
7090 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007091 0, /* sq_ass_item */
7092 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00007093 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007094};
7095
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007096static PyObject*
7097unicode_subscript(PyUnicodeObject* self, PyObject* item)
7098{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007099 if (PyIndex_Check(item)) {
7100 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007101 if (i == -1 && PyErr_Occurred())
7102 return NULL;
7103 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007104 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007105 return unicode_getitem(self, i);
7106 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007107 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007108 Py_UNICODE* source_buf;
7109 Py_UNICODE* result_buf;
7110 PyObject* result;
7111
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007112 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007113 &start, &stop, &step, &slicelength) < 0) {
7114 return NULL;
7115 }
7116
7117 if (slicelength <= 0) {
7118 return PyUnicode_FromUnicode(NULL, 0);
7119 } else {
7120 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Anthony Baxtera6286212006-04-11 07:42:36 +00007121 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7122 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007123
7124 if (result_buf == NULL)
7125 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007126
7127 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7128 result_buf[i] = source_buf[cur];
7129 }
Tim Petersced69f82003-09-16 20:30:58 +00007130
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007131 result = PyUnicode_FromUnicode(result_buf, slicelength);
7132 PyMem_FREE(result_buf);
7133 return result;
7134 }
7135 } else {
7136 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7137 return NULL;
7138 }
7139}
7140
7141static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007142 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007143 (binaryfunc)unicode_subscript, /* mp_subscript */
7144 (objobjargproc)0, /* mp_ass_subscript */
7145};
7146
Martin v. Löwis18e16552006-02-15 17:27:45 +00007147static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007148unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007149 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150 const void **ptr)
7151{
7152 if (index != 0) {
7153 PyErr_SetString(PyExc_SystemError,
7154 "accessing non-existent unicode segment");
7155 return -1;
7156 }
7157 *ptr = (void *) self->str;
7158 return PyUnicode_GET_DATA_SIZE(self);
7159}
7160
Martin v. Löwis18e16552006-02-15 17:27:45 +00007161static Py_ssize_t
7162unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007163 const void **ptr)
7164{
7165 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007166 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167 return -1;
7168}
7169
7170static int
7171unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007172 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007173{
7174 if (lenp)
7175 *lenp = PyUnicode_GET_DATA_SIZE(self);
7176 return 1;
7177}
7178
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007179static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007180unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007181 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182 const void **ptr)
7183{
7184 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007185
Guido van Rossumd57fd912000-03-10 22:53:23 +00007186 if (index != 0) {
7187 PyErr_SetString(PyExc_SystemError,
7188 "accessing non-existent unicode segment");
7189 return -1;
7190 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007191 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007192 if (str == NULL)
7193 return -1;
7194 *ptr = (void *) PyString_AS_STRING(str);
7195 return PyString_GET_SIZE(str);
7196}
7197
7198/* Helpers for PyUnicode_Format() */
7199
7200static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007201getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007202{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007203 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204 if (argidx < arglen) {
7205 (*p_argidx)++;
7206 if (arglen < 0)
7207 return args;
7208 else
7209 return PyTuple_GetItem(args, argidx);
7210 }
7211 PyErr_SetString(PyExc_TypeError,
7212 "not enough arguments for format string");
7213 return NULL;
7214}
7215
/* Conversion-flag bits collected while parsing a format specifier. */
#define F_LJUST 0x01    /* '-' flag */
#define F_SIGN  0x02    /* '+' flag */
#define F_BLANK 0x04    /* ' ' flag */
#define F_ALT   0x08    /* '#' flag */
#define F_ZERO  0x10    /* '0' flag */
7221
Martin v. Löwis18e16552006-02-15 17:27:45 +00007222static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007223strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007225 register Py_ssize_t i;
7226 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227 for (i = len - 1; i >= 0; i--)
7228 buffer[i] = (Py_UNICODE) charbuffer[i];
7229
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230 return len;
7231}
7232
Neal Norwitzfc76d632006-01-10 06:03:13 +00007233static int
7234doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7235{
Tim Peters15231542006-02-16 01:08:01 +00007236 Py_ssize_t result;
7237
Neal Norwitzfc76d632006-01-10 06:03:13 +00007238 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007239 result = strtounicode(buffer, (char *)buffer);
7240 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007241}
7242
7243static int
7244longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7245{
Tim Peters15231542006-02-16 01:08:01 +00007246 Py_ssize_t result;
7247
Neal Norwitzfc76d632006-01-10 06:03:13 +00007248 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007249 result = strtounicode(buffer, (char *)buffer);
7250 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007251}
7252
Guido van Rossum078151d2002-08-11 04:24:12 +00007253/* XXX To save some code duplication, formatfloat/long/int could have been
7254 shared with stringobject.c, converting from 8-bit to Unicode after the
7255 formatting is done. */
7256
Guido van Rossumd57fd912000-03-10 22:53:23 +00007257static int
7258formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007259 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007260 int flags,
7261 int prec,
7262 int type,
7263 PyObject *v)
7264{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007265 /* fmt = '%#.' + `prec` + `type`
7266 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007267 char fmt[20];
7268 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007269
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270 x = PyFloat_AsDouble(v);
7271 if (x == -1.0 && PyErr_Occurred())
7272 return -1;
7273 if (prec < 0)
7274 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7276 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007277 /* Worst case length calc to ensure no buffer overrun:
7278
7279 'g' formats:
7280 fmt = %#.<prec>g
7281 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7282 for any double rep.)
7283 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7284
7285 'f' formats:
7286 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7287 len = 1 + 50 + 1 + prec = 52 + prec
7288
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007289 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007290 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007291
7292 */
Georg Brandlc5db9232007-07-12 08:38:04 +00007293 if (((type == 'g' || type == 'G') &&
7294 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007295 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007296 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007297 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007298 return -1;
7299 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007300 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7301 (flags&F_ALT) ? "#" : "",
7302 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007303 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007304}
7305
Tim Peters38fd5b62000-09-21 05:43:11 +00007306static PyObject*
7307formatlong(PyObject *val, int flags, int prec, int type)
7308{
7309 char *buf;
7310 int i, len;
7311 PyObject *str; /* temporary string object. */
7312 PyUnicodeObject *result;
7313
7314 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7315 if (!str)
7316 return NULL;
7317 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007318 if (!result) {
7319 Py_DECREF(str);
7320 return NULL;
7321 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007322 for (i = 0; i < len; i++)
7323 result->str[i] = buf[i];
7324 result->str[len] = 0;
7325 Py_DECREF(str);
7326 return (PyObject*)result;
7327}
7328
Guido van Rossumd57fd912000-03-10 22:53:23 +00007329static int
7330formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007331 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007332 int flags,
7333 int prec,
7334 int type,
7335 PyObject *v)
7336{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007337 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007338 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7339 * + 1 + 1
7340 * = 24
7341 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007342 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007343 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007344 long x;
7345
7346 x = PyInt_AsLong(v);
7347 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007348 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007349 if (x < 0 && type == 'u') {
7350 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007351 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007352 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7353 sign = "-";
7354 else
7355 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007357 prec = 1;
7358
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007359 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7360 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007361 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007362 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007363 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007364 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007365 return -1;
7366 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007367
7368 if ((flags & F_ALT) &&
7369 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007370 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007371 * of issues that cause pain:
7372 * - when 0 is being converted, the C standard leaves off
7373 * the '0x' or '0X', which is inconsistent with other
7374 * %#x/%#X conversions and inconsistent with Python's
7375 * hex() function
7376 * - there are platforms that violate the standard and
7377 * convert 0 with the '0x' or '0X'
7378 * (Metrowerks, Compaq Tru64)
7379 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007380 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007381 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007382 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007383 * We can achieve the desired consistency by inserting our
7384 * own '0x' or '0X' prefix, and substituting %x/%X in place
7385 * of %#x/%#X.
7386 *
7387 * Note that this is the same approach as used in
7388 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007389 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007390 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7391 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007392 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007393 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007394 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7395 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007396 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007397 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007398 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007399 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007400 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007401 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402}
7403
7404static int
7405formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007406 size_t buflen,
7407 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007408{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007409 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007410 if (PyUnicode_Check(v)) {
7411 if (PyUnicode_GET_SIZE(v) != 1)
7412 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007413 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007414 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007415
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007416 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007417 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007418 goto onError;
7419 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7420 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007421
7422 else {
7423 /* Integer input truncated to a character */
7424 long x;
7425 x = PyInt_AsLong(v);
7426 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007427 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007428#ifdef Py_UNICODE_WIDE
7429 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007430 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007431 "%c arg not in range(0x110000) "
7432 "(wide Python build)");
7433 return -1;
7434 }
7435#else
7436 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007437 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007438 "%c arg not in range(0x10000) "
7439 "(narrow Python build)");
7440 return -1;
7441 }
7442#endif
7443 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444 }
7445 buf[1] = '\0';
7446 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007447
7448 onError:
7449 PyErr_SetString(PyExc_TypeError,
7450 "%c requires int or char");
7451 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007452}
7453
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007454/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
7455
7456 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
7457 chars are formatted. XXX This is a magic number. Each formatting
7458 routine does bounds checking to ensure no overflow, but a better
7459 solution may be to malloc a buffer of appropriate size for each
7460 format. For now, the current solution is sufficient.
7461*/
/* Parenthesized so the cast cannot be split apart by whatever expression
   the macro is expanded into (e.g. `sizeof FORMATBUFLEN`). */
#define FORMATBUFLEN ((size_t)120)
7463
Guido van Rossumd57fd912000-03-10 22:53:23 +00007464PyObject *PyUnicode_Format(PyObject *format,
7465 PyObject *args)
7466{
7467 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007468 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007469 int args_owned = 0;
7470 PyUnicodeObject *result = NULL;
7471 PyObject *dict = NULL;
7472 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007473
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474 if (format == NULL || args == NULL) {
7475 PyErr_BadInternalCall();
7476 return NULL;
7477 }
7478 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007479 if (uformat == NULL)
7480 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007481 fmt = PyUnicode_AS_UNICODE(uformat);
7482 fmtcnt = PyUnicode_GET_SIZE(uformat);
7483
7484 reslen = rescnt = fmtcnt + 100;
7485 result = _PyUnicode_New(reslen);
7486 if (result == NULL)
7487 goto onError;
7488 res = PyUnicode_AS_UNICODE(result);
7489
7490 if (PyTuple_Check(args)) {
7491 arglen = PyTuple_Size(args);
7492 argidx = 0;
7493 }
7494 else {
7495 arglen = -1;
7496 argidx = -2;
7497 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007498 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7499 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500 dict = args;
7501
7502 while (--fmtcnt >= 0) {
7503 if (*fmt != '%') {
7504 if (--rescnt < 0) {
7505 rescnt = fmtcnt + 100;
7506 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007507 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007508 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007509 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7510 --rescnt;
7511 }
7512 *res++ = *fmt++;
7513 }
7514 else {
7515 /* Got a format specifier */
7516 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007517 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007518 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007519 Py_UNICODE c = '\0';
7520 Py_UNICODE fill;
7521 PyObject *v = NULL;
7522 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007523 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007524 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007525 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007526 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007527
7528 fmt++;
7529 if (*fmt == '(') {
7530 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007531 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532 PyObject *key;
7533 int pcount = 1;
7534
7535 if (dict == NULL) {
7536 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007537 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007538 goto onError;
7539 }
7540 ++fmt;
7541 --fmtcnt;
7542 keystart = fmt;
7543 /* Skip over balanced parentheses */
7544 while (pcount > 0 && --fmtcnt >= 0) {
7545 if (*fmt == ')')
7546 --pcount;
7547 else if (*fmt == '(')
7548 ++pcount;
7549 fmt++;
7550 }
7551 keylen = fmt - keystart - 1;
7552 if (fmtcnt < 0 || pcount > 0) {
7553 PyErr_SetString(PyExc_ValueError,
7554 "incomplete format key");
7555 goto onError;
7556 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007557#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007558 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007559 then looked up since Python uses strings to hold
7560 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007561 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007562 key = PyUnicode_EncodeUTF8(keystart,
7563 keylen,
7564 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007565#else
7566 key = PyUnicode_FromUnicode(keystart, keylen);
7567#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568 if (key == NULL)
7569 goto onError;
7570 if (args_owned) {
7571 Py_DECREF(args);
7572 args_owned = 0;
7573 }
7574 args = PyObject_GetItem(dict, key);
7575 Py_DECREF(key);
7576 if (args == NULL) {
7577 goto onError;
7578 }
7579 args_owned = 1;
7580 arglen = -1;
7581 argidx = -2;
7582 }
7583 while (--fmtcnt >= 0) {
7584 switch (c = *fmt++) {
7585 case '-': flags |= F_LJUST; continue;
7586 case '+': flags |= F_SIGN; continue;
7587 case ' ': flags |= F_BLANK; continue;
7588 case '#': flags |= F_ALT; continue;
7589 case '0': flags |= F_ZERO; continue;
7590 }
7591 break;
7592 }
7593 if (c == '*') {
7594 v = getnextarg(args, arglen, &argidx);
7595 if (v == NULL)
7596 goto onError;
7597 if (!PyInt_Check(v)) {
7598 PyErr_SetString(PyExc_TypeError,
7599 "* wants int");
7600 goto onError;
7601 }
7602 width = PyInt_AsLong(v);
7603 if (width < 0) {
7604 flags |= F_LJUST;
7605 width = -width;
7606 }
7607 if (--fmtcnt >= 0)
7608 c = *fmt++;
7609 }
7610 else if (c >= '0' && c <= '9') {
7611 width = c - '0';
7612 while (--fmtcnt >= 0) {
7613 c = *fmt++;
7614 if (c < '0' || c > '9')
7615 break;
7616 if ((width*10) / 10 != width) {
7617 PyErr_SetString(PyExc_ValueError,
7618 "width too big");
7619 goto onError;
7620 }
7621 width = width*10 + (c - '0');
7622 }
7623 }
7624 if (c == '.') {
7625 prec = 0;
7626 if (--fmtcnt >= 0)
7627 c = *fmt++;
7628 if (c == '*') {
7629 v = getnextarg(args, arglen, &argidx);
7630 if (v == NULL)
7631 goto onError;
7632 if (!PyInt_Check(v)) {
7633 PyErr_SetString(PyExc_TypeError,
7634 "* wants int");
7635 goto onError;
7636 }
7637 prec = PyInt_AsLong(v);
7638 if (prec < 0)
7639 prec = 0;
7640 if (--fmtcnt >= 0)
7641 c = *fmt++;
7642 }
7643 else if (c >= '0' && c <= '9') {
7644 prec = c - '0';
7645 while (--fmtcnt >= 0) {
7646 c = Py_CHARMASK(*fmt++);
7647 if (c < '0' || c > '9')
7648 break;
7649 if ((prec*10) / 10 != prec) {
7650 PyErr_SetString(PyExc_ValueError,
7651 "prec too big");
7652 goto onError;
7653 }
7654 prec = prec*10 + (c - '0');
7655 }
7656 }
7657 } /* prec */
7658 if (fmtcnt >= 0) {
7659 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007660 if (--fmtcnt >= 0)
7661 c = *fmt++;
7662 }
7663 }
7664 if (fmtcnt < 0) {
7665 PyErr_SetString(PyExc_ValueError,
7666 "incomplete format");
7667 goto onError;
7668 }
7669 if (c != '%') {
7670 v = getnextarg(args, arglen, &argidx);
7671 if (v == NULL)
7672 goto onError;
7673 }
7674 sign = 0;
7675 fill = ' ';
7676 switch (c) {
7677
7678 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007679 pbuf = formatbuf;
7680 /* presume that buffer length is at least 1 */
7681 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682 len = 1;
7683 break;
7684
7685 case 's':
7686 case 'r':
7687 if (PyUnicode_Check(v) && c == 's') {
7688 temp = v;
7689 Py_INCREF(temp);
7690 }
7691 else {
7692 PyObject *unicode;
7693 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007694 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007695 else
7696 temp = PyObject_Repr(v);
7697 if (temp == NULL)
7698 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007699 if (PyUnicode_Check(temp))
7700 /* nothing to do */;
7701 else if (PyString_Check(temp)) {
7702 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007703 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007704 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007705 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007706 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007707 Py_DECREF(temp);
7708 temp = unicode;
7709 if (temp == NULL)
7710 goto onError;
7711 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007712 else {
7713 Py_DECREF(temp);
7714 PyErr_SetString(PyExc_TypeError,
7715 "%s argument has non-string str()");
7716 goto onError;
7717 }
7718 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007719 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720 len = PyUnicode_GET_SIZE(temp);
7721 if (prec >= 0 && len > prec)
7722 len = prec;
7723 break;
7724
7725 case 'i':
7726 case 'd':
7727 case 'u':
7728 case 'o':
7729 case 'x':
7730 case 'X':
7731 if (c == 'i')
7732 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007733 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007734 temp = formatlong(v, flags, prec, c);
7735 if (!temp)
7736 goto onError;
7737 pbuf = PyUnicode_AS_UNICODE(temp);
7738 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007739 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007741 else {
7742 pbuf = formatbuf;
7743 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7744 flags, prec, c, v);
7745 if (len < 0)
7746 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007747 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007748 }
7749 if (flags & F_ZERO)
7750 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751 break;
7752
7753 case 'e':
7754 case 'E':
7755 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007756 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757 case 'g':
7758 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007759 if (c == 'F')
7760 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007761 pbuf = formatbuf;
7762 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7763 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007764 if (len < 0)
7765 goto onError;
7766 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007767 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768 fill = '0';
7769 break;
7770
7771 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007772 pbuf = formatbuf;
7773 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007774 if (len < 0)
7775 goto onError;
7776 break;
7777
7778 default:
7779 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007780 "unsupported format character '%c' (0x%x) "
Armin Rigo4b63c212006-10-04 11:44:06 +00007781 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00007782 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007783 (int)c,
Armin Rigo4b63c212006-10-04 11:44:06 +00007784 (Py_ssize_t)(fmt - 1 -
7785 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007786 goto onError;
7787 }
7788 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007789 if (*pbuf == '-' || *pbuf == '+') {
7790 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007791 len--;
7792 }
7793 else if (flags & F_SIGN)
7794 sign = '+';
7795 else if (flags & F_BLANK)
7796 sign = ' ';
7797 else
7798 sign = 0;
7799 }
7800 if (width < len)
7801 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007802 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007803 reslen -= rescnt;
7804 rescnt = width + fmtcnt + 100;
7805 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007806 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007807 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007808 PyErr_NoMemory();
7809 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007810 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007811 if (_PyUnicode_Resize(&result, reslen) < 0) {
7812 Py_XDECREF(temp);
7813 goto onError;
7814 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007815 res = PyUnicode_AS_UNICODE(result)
7816 + reslen - rescnt;
7817 }
7818 if (sign) {
7819 if (fill != ' ')
7820 *res++ = sign;
7821 rescnt--;
7822 if (width > len)
7823 width--;
7824 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007825 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7826 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007827 assert(pbuf[1] == c);
7828 if (fill != ' ') {
7829 *res++ = *pbuf++;
7830 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007831 }
Tim Petersfff53252001-04-12 18:38:48 +00007832 rescnt -= 2;
7833 width -= 2;
7834 if (width < 0)
7835 width = 0;
7836 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007837 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838 if (width > len && !(flags & F_LJUST)) {
7839 do {
7840 --rescnt;
7841 *res++ = fill;
7842 } while (--width > len);
7843 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007844 if (fill == ' ') {
7845 if (sign)
7846 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007847 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007848 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007849 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007850 *res++ = *pbuf++;
7851 *res++ = *pbuf++;
7852 }
7853 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007854 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007855 res += len;
7856 rescnt -= len;
7857 while (--width >= len) {
7858 --rescnt;
7859 *res++ = ' ';
7860 }
7861 if (dict && (argidx < arglen) && c != '%') {
7862 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007863 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007864 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007865 goto onError;
7866 }
7867 Py_XDECREF(temp);
7868 } /* '%' */
7869 } /* until end */
7870 if (argidx < arglen && !dict) {
7871 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007872 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007873 goto onError;
7874 }
7875
Thomas Woutersa96affe2006-03-12 00:29:36 +00007876 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7877 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007878 if (args_owned) {
7879 Py_DECREF(args);
7880 }
7881 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007882 return (PyObject *)result;
7883
7884 onError:
7885 Py_XDECREF(result);
7886 Py_DECREF(uformat);
7887 if (args_owned) {
7888 Py_DECREF(args);
7889 }
7890 return NULL;
7891}
7892
7893static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007894 (readbufferproc) unicode_buffer_getreadbuf,
7895 (writebufferproc) unicode_buffer_getwritebuf,
7896 (segcountproc) unicode_buffer_getsegcount,
7897 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007898};
7899
Jeremy Hylton938ace62002-07-17 16:30:39 +00007900static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007901unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7902
Tim Peters6d6c1a32001-08-02 04:15:00 +00007903static PyObject *
7904unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7905{
7906 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007907 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007908 char *encoding = NULL;
7909 char *errors = NULL;
7910
Guido van Rossume023fe02001-08-30 03:12:59 +00007911 if (type != &PyUnicode_Type)
7912 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007913 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7914 kwlist, &x, &encoding, &errors))
7915 return NULL;
7916 if (x == NULL)
7917 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007918 if (encoding == NULL && errors == NULL)
7919 return PyObject_Unicode(x);
7920 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007921 return PyUnicode_FromEncodedObject(x, encoding, errors);
7922}
7923
Guido van Rossume023fe02001-08-30 03:12:59 +00007924static PyObject *
7925unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7926{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007927 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007928 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007929
7930 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7931 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7932 if (tmp == NULL)
7933 return NULL;
7934 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007935 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007936 if (pnew == NULL) {
7937 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007938 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007939 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007940 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7941 if (pnew->str == NULL) {
7942 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007943 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007944 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007945 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007946 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007947 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7948 pnew->length = n;
7949 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007950 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007951 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007952}
7953
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007954PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007955"unicode(string [, encoding[, errors]]) -> object\n\
7956\n\
7957Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007958encoding defaults to the current default string encoding.\n\
7959errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007960
Guido van Rossumd57fd912000-03-10 22:53:23 +00007961PyTypeObject PyUnicode_Type = {
7962 PyObject_HEAD_INIT(&PyType_Type)
7963 0, /* ob_size */
7964 "unicode", /* tp_name */
7965 sizeof(PyUnicodeObject), /* tp_size */
7966 0, /* tp_itemsize */
7967 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007968 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007969 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007970 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007971 0, /* tp_setattr */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00007972 0, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00007973 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007974 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007976 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007977 (hashfunc) unicode_hash, /* tp_hash*/
7978 0, /* tp_call*/
7979 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007980 PyObject_GenericGetAttr, /* tp_getattro */
7981 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007982 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007983 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7984 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007985 unicode_doc, /* tp_doc */
7986 0, /* tp_traverse */
7987 0, /* tp_clear */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00007988 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007989 0, /* tp_weaklistoffset */
7990 0, /* tp_iter */
7991 0, /* tp_iternext */
7992 unicode_methods, /* tp_methods */
7993 0, /* tp_members */
7994 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007995 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007996 0, /* tp_dict */
7997 0, /* tp_descr_get */
7998 0, /* tp_descr_set */
7999 0, /* tp_dictoffset */
8000 0, /* tp_init */
8001 0, /* tp_alloc */
8002 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008003 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008004};
8005
8006/* Initialize the Unicode implementation */
8007
Thomas Wouters78890102000-07-22 19:25:51 +00008008void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008010 int i;
8011
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008012 /* XXX - move this array to unicodectype.c ? */
8013 Py_UNICODE linebreak[] = {
8014 0x000A, /* LINE FEED */
8015 0x000D, /* CARRIAGE RETURN */
8016 0x001C, /* FILE SEPARATOR */
8017 0x001D, /* GROUP SEPARATOR */
8018 0x001E, /* RECORD SEPARATOR */
8019 0x0085, /* NEXT LINE */
8020 0x2028, /* LINE SEPARATOR */
8021 0x2029, /* PARAGRAPH SEPARATOR */
8022 };
8023
Fred Drakee4315f52000-05-09 19:53:39 +00008024 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008025 unicode_freelist = NULL;
8026 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008028 if (!unicode_empty)
8029 return;
8030
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008031 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008032 for (i = 0; i < 256; i++)
8033 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008034 if (PyType_Ready(&PyUnicode_Type) < 0)
8035 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008036
8037 /* initialize the linebreak bloom filter */
8038 bloom_linebreak = make_bloom_mask(
8039 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8040 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008041
8042 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043}
8044
8045/* Finalize the Unicode implementation */
8046
8047void
Thomas Wouters78890102000-07-22 19:25:51 +00008048_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008049{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008050 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008051 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008053 Py_XDECREF(unicode_empty);
8054 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008055
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008056 for (i = 0; i < 256; i++) {
8057 if (unicode_latin1[i]) {
8058 Py_DECREF(unicode_latin1[i]);
8059 unicode_latin1[i] = NULL;
8060 }
8061 }
8062
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008063 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008064 PyUnicodeObject *v = u;
8065 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008066 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008067 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008068 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008069 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008070 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008071 unicode_freelist = NULL;
8072 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008073}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008074
Anthony Baxterac6bd462006-04-13 02:06:09 +00008075#ifdef __cplusplus
8076}
8077#endif
8078
8079
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008080/*
8081Local variables:
8082c-basic-offset: 4
8083indent-tabs-mode: nil
8084End:
8085*/