blob: eb5bdd8458c755e4065750b6df4d38ebbf7778a4 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of Unicode objects kept on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Threshold for the free list "keep-alive" optimization.

   Objects placed on the free list keep their character buffer
   allocated as long as their length is below this limit.  This
   reduces malloc() overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
/* Endianness switches: exactly one of the two BYTEORDER_IS_* macros is
   defined; little endian is assumed unless WORDS_BIGENDIAN is set. */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000116PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000117{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000118#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000119 return 0x10FFFF;
120#else
121 /* This is actually an illegal character, so it should
122 not be passed to unichr. */
123 return 0xFFFF;
124#endif
125}
126
/* --- Bloom Filters ----------------------------------------------------- */

/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least 5
   bits of each Unicode character as the bit index (0..31). */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by ch is set in mask.  The shifted
   constant is an unsigned long so that bit 31 can be selected without
   shifting into the sign bit of an int, and mask is parenthesized so
   that arbitrary expressions may be passed. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap pre-filter followed by the exact linebreak test. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
143
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000144Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000145{
146 /* calculate simple bloom-style bitmask for a given unicode string */
147
148 long mask;
149 Py_ssize_t i;
150
151 mask = 0;
152 for (i = 0; i < len; i++)
153 mask |= (1 << (ptr[i] & 0x1F));
154
155 return mask;
156}
157
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000158Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000159{
160 Py_ssize_t i;
161
162 for (i = 0; i < setlen; i++)
163 if (set[i] == chr)
164 return 1;
165
Fredrik Lundh77633512006-05-23 19:47:35 +0000166 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000167}
168
/* Bloom pre-filter followed by the exact membership test.  The whole
   expansion is parenthesized so the macro behaves as a single operand
   (e.g. under `!` or next to `||`). */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
171
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172/* --- Unicode Object ----------------------------------------------------- */
173
174static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000176 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177{
178 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000179
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000180 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000181 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000182 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184 /* Resizing shared object (unicode_empty or single character
185 objects) in-place is not allowed. Use PyUnicode_Resize()
186 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000187
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000188 if (unicode == unicode_empty ||
189 (unicode->length == 1 &&
190 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000191 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000192 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 return -1;
195 }
196
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000197 /* We allocate one more byte to make sure the string is Ux0000 terminated.
198 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000199 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000200 it contains). */
201
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 oldstr = unicode->str;
203 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
204 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000205 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000206 PyErr_NoMemory();
207 return -1;
208 }
209 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000210 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000212 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000214 if (unicode->defenc) {
215 Py_DECREF(unicode->defenc);
216 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217 }
218 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000219
Guido van Rossumd57fd912000-03-10 22:53:23 +0000220 return 0;
221}
222
223/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000224 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225
226 XXX This allocator could further be enhanced by assuring that the
227 free list never reduces its size below 1.
228
229*/
230
231static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000232PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233{
234 register PyUnicodeObject *unicode;
235
Andrew Dalkee0df7622006-05-27 11:04:36 +0000236 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237 if (length == 0 && unicode_empty != NULL) {
238 Py_INCREF(unicode_empty);
239 return unicode_empty;
240 }
241
242 /* Unicode freelist & memory allocation */
243 if (unicode_freelist) {
244 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000245 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000248 /* Keep-Alive optimization: we only upsize the buffer,
249 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000250 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000251 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000252 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000253 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254 }
255 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000256 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000258 }
259 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000262 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 if (unicode == NULL)
264 return NULL;
265 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
266 }
267
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000268 if (!unicode->str) {
269 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000270 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000271 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000272 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000273 * the caller fails before initializing str -- unicode_resize()
274 * reads str[0], and the Keep-Alive optimization can keep memory
275 * allocated for str alive across a call to unicode_dealloc(unicode).
276 * We don't want unicode_resize to read uninitialized memory in
277 * that case.
278 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000279 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000283 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285
286 onError:
287 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000288 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000289 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290}
291
292static
Guido van Rossum9475a232001-10-05 20:51:39 +0000293void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000295 if (PyUnicode_CheckExact(unicode) &&
296 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000297 /* Keep-Alive optimization */
298 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000299 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 unicode->str = NULL;
301 unicode->length = 0;
302 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000303 if (unicode->defenc) {
304 Py_DECREF(unicode->defenc);
305 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000306 }
307 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 *(PyUnicodeObject **)unicode = unicode_freelist;
309 unicode_freelist = unicode;
310 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311 }
312 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000313 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000314 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000315 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316 }
317}
318
Martin v. Löwis18e16552006-02-15 17:27:45 +0000319int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000320{
321 register PyUnicodeObject *v;
322
323 /* Argument checks */
324 if (unicode == NULL) {
325 PyErr_BadInternalCall();
326 return -1;
327 }
328 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000329 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000330 PyErr_BadInternalCall();
331 return -1;
332 }
333
334 /* Resizing unicode_empty and single character objects is not
335 possible since these are being shared. We simply return a fresh
336 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000337 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000338 (v == unicode_empty || v->length == 1)) {
339 PyUnicodeObject *w = _PyUnicode_New(length);
340 if (w == NULL)
341 return -1;
342 Py_UNICODE_COPY(w->str, v->str,
343 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000344 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000345 *unicode = (PyObject *)w;
346 return 0;
347 }
348
349 /* Note that we don't have to modify *unicode for unshared Unicode
350 objects, since we can modify them in-place. */
351 return unicode_resize(v, length);
352}
353
/* Internal API for use in unicodeobject.c only!  Convenience wrapper
   that casts the address of a PyUnicodeObject* variable for
   PyUnicode_Resize(). */
#define _PyUnicode_Resize(unicodevar, length)                   \
    PyUnicode_Resize(((PyObject **)(unicodevar)), length)
357
Guido van Rossumd57fd912000-03-10 22:53:23 +0000358PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000359 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360{
361 PyUnicodeObject *unicode;
362
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000363 /* If the Unicode data is known at construction time, we can apply
364 some optimizations which share commonly used objects. */
365 if (u != NULL) {
366
367 /* Optimization for empty strings */
368 if (size == 0 && unicode_empty != NULL) {
369 Py_INCREF(unicode_empty);
370 return (PyObject *)unicode_empty;
371 }
372
373 /* Single character Unicode objects in the Latin-1 range are
374 shared when using this constructor */
375 if (size == 1 && *u < 256) {
376 unicode = unicode_latin1[*u];
377 if (!unicode) {
378 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000379 if (!unicode)
380 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000381 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000382 unicode_latin1[*u] = unicode;
383 }
384 Py_INCREF(unicode);
385 return (PyObject *)unicode;
386 }
387 }
Tim Petersced69f82003-09-16 20:30:58 +0000388
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 unicode = _PyUnicode_New(size);
390 if (!unicode)
391 return NULL;
392
393 /* Copy the Unicode data into the new object */
394 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000395 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000396
397 return (PyObject *)unicode;
398}
399
400#ifdef HAVE_WCHAR_H
401
402PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000403 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000404{
405 PyUnicodeObject *unicode;
406
407 if (w == NULL) {
408 PyErr_BadInternalCall();
409 return NULL;
410 }
411
412 unicode = _PyUnicode_New(size);
413 if (!unicode)
414 return NULL;
415
416 /* Copy the wchar_t data into the new object */
417#ifdef HAVE_USABLE_WCHAR_T
418 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000419#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000420 {
421 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000422 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000424 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425 *u++ = *w++;
426 }
427#endif
428
429 return (PyObject *)unicode;
430}
431
Martin v. Löwis18e16552006-02-15 17:27:45 +0000432Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
433 wchar_t *w,
434 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000435{
436 if (unicode == NULL) {
437 PyErr_BadInternalCall();
438 return -1;
439 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000440
441 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000442 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000443 size = PyUnicode_GET_SIZE(unicode) + 1;
444
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445#ifdef HAVE_USABLE_WCHAR_T
446 memcpy(w, unicode->str, size * sizeof(wchar_t));
447#else
448 {
449 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000450 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000451 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000452 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000453 *w++ = *u++;
454 }
455#endif
456
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000457 if (size > PyUnicode_GET_SIZE(unicode))
458 return PyUnicode_GET_SIZE(unicode);
459 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000460 return size;
461}
462
463#endif
464
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000465PyObject *PyUnicode_FromOrdinal(int ordinal)
466{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000467 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000468
469#ifdef Py_UNICODE_WIDE
470 if (ordinal < 0 || ordinal > 0x10ffff) {
471 PyErr_SetString(PyExc_ValueError,
472 "unichr() arg not in range(0x110000) "
473 "(wide Python build)");
474 return NULL;
475 }
476#else
477 if (ordinal < 0 || ordinal > 0xffff) {
478 PyErr_SetString(PyExc_ValueError,
479 "unichr() arg not in range(0x10000) "
480 "(narrow Python build)");
481 return NULL;
482 }
483#endif
484
Hye-Shik Chang40574832004-04-06 07:24:51 +0000485 s[0] = (Py_UNICODE)ordinal;
486 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000487}
488
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489PyObject *PyUnicode_FromObject(register PyObject *obj)
490{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491 /* XXX Perhaps we should make this API an alias of
492 PyObject_Unicode() instead ?! */
493 if (PyUnicode_CheckExact(obj)) {
494 Py_INCREF(obj);
495 return obj;
496 }
497 if (PyUnicode_Check(obj)) {
498 /* For a Unicode subtype that's not a Unicode object,
499 return a true Unicode object with the same data. */
500 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
501 PyUnicode_GET_SIZE(obj));
502 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000503 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
504}
505
506PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
507 const char *encoding,
508 const char *errors)
509{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000510 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000511 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000512 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000513
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 if (obj == NULL) {
515 PyErr_BadInternalCall();
516 return NULL;
517 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000518
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000519#if 0
520 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000521 that no encodings is given and then redirect to
522 PyObject_Unicode() which then applies the additional logic for
523 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000524
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000525 NOTE: This API should really only be used for object which
526 represent *encoded* Unicode !
527
528 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000529 if (PyUnicode_Check(obj)) {
530 if (encoding) {
531 PyErr_SetString(PyExc_TypeError,
532 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000533 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000534 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000535 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000536 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000537#else
538 if (PyUnicode_Check(obj)) {
539 PyErr_SetString(PyExc_TypeError,
540 "decoding Unicode is not supported");
541 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000542 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000543#endif
544
545 /* Coerce object */
546 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000547 s = PyString_AS_STRING(obj);
548 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000549 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000550 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
551 /* Overwrite the error message with something more useful in
552 case of a TypeError. */
553 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000554 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000555 "coercing to Unicode: need string or buffer, "
556 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000557 obj->ob_type->tp_name);
558 goto onError;
559 }
Tim Petersced69f82003-09-16 20:30:58 +0000560
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000561 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000562 if (len == 0) {
563 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000564 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000565 }
Tim Petersced69f82003-09-16 20:30:58 +0000566 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000567 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000568
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000569 return v;
570
571 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000572 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573}
574
575PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000576 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000577 const char *encoding,
578 const char *errors)
579{
580 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000581
582 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000583 encoding = PyUnicode_GetDefaultEncoding();
584
585 /* Shortcuts for common default encodings */
586 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000587 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000588 else if (strcmp(encoding, "latin-1") == 0)
589 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000590#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
591 else if (strcmp(encoding, "mbcs") == 0)
592 return PyUnicode_DecodeMBCS(s, size, errors);
593#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000594 else if (strcmp(encoding, "ascii") == 0)
595 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596
597 /* Decode via the codec registry */
598 buffer = PyBuffer_FromMemory((void *)s, size);
599 if (buffer == NULL)
600 goto onError;
601 unicode = PyCodec_Decode(buffer, encoding, errors);
602 if (unicode == NULL)
603 goto onError;
604 if (!PyUnicode_Check(unicode)) {
605 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000606 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000607 unicode->ob_type->tp_name);
608 Py_DECREF(unicode);
609 goto onError;
610 }
611 Py_DECREF(buffer);
612 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000613
Guido van Rossumd57fd912000-03-10 22:53:23 +0000614 onError:
615 Py_XDECREF(buffer);
616 return NULL;
617}
618
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000619PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
620 const char *encoding,
621 const char *errors)
622{
623 PyObject *v;
624
625 if (!PyUnicode_Check(unicode)) {
626 PyErr_BadArgument();
627 goto onError;
628 }
629
630 if (encoding == NULL)
631 encoding = PyUnicode_GetDefaultEncoding();
632
633 /* Decode via the codec registry */
634 v = PyCodec_Decode(unicode, encoding, errors);
635 if (v == NULL)
636 goto onError;
637 return v;
638
639 onError:
640 return NULL;
641}
642
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000644 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 const char *encoding,
646 const char *errors)
647{
648 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000649
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 unicode = PyUnicode_FromUnicode(s, size);
651 if (unicode == NULL)
652 return NULL;
653 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
654 Py_DECREF(unicode);
655 return v;
656}
657
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000658PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
659 const char *encoding,
660 const char *errors)
661{
662 PyObject *v;
663
664 if (!PyUnicode_Check(unicode)) {
665 PyErr_BadArgument();
666 goto onError;
667 }
668
669 if (encoding == NULL)
670 encoding = PyUnicode_GetDefaultEncoding();
671
672 /* Encode via the codec registry */
673 v = PyCodec_Encode(unicode, encoding, errors);
674 if (v == NULL)
675 goto onError;
676 return v;
677
678 onError:
679 return NULL;
680}
681
Guido van Rossumd57fd912000-03-10 22:53:23 +0000682PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
683 const char *encoding,
684 const char *errors)
685{
686 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000687
Guido van Rossumd57fd912000-03-10 22:53:23 +0000688 if (!PyUnicode_Check(unicode)) {
689 PyErr_BadArgument();
690 goto onError;
691 }
Fred Drakee4315f52000-05-09 19:53:39 +0000692
Tim Petersced69f82003-09-16 20:30:58 +0000693 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000694 encoding = PyUnicode_GetDefaultEncoding();
695
696 /* Shortcuts for common default encodings */
697 if (errors == NULL) {
698 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000699 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000700 else if (strcmp(encoding, "latin-1") == 0)
701 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000702#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
703 else if (strcmp(encoding, "mbcs") == 0)
704 return PyUnicode_AsMBCSString(unicode);
705#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000706 else if (strcmp(encoding, "ascii") == 0)
707 return PyUnicode_AsASCIIString(unicode);
708 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000709
710 /* Encode via the codec registry */
711 v = PyCodec_Encode(unicode, encoding, errors);
712 if (v == NULL)
713 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000714 if (!PyString_Check(v)) {
715 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000716 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717 v->ob_type->tp_name);
718 Py_DECREF(v);
719 goto onError;
720 }
721 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000722
Guido van Rossumd57fd912000-03-10 22:53:23 +0000723 onError:
724 return NULL;
725}
726
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000727PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
728 const char *errors)
729{
730 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
731
732 if (v)
733 return v;
734 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
735 if (v && errors == NULL)
736 ((PyUnicodeObject *)unicode)->defenc = v;
737 return v;
738}
739
Guido van Rossumd57fd912000-03-10 22:53:23 +0000740Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
741{
742 if (!PyUnicode_Check(unicode)) {
743 PyErr_BadArgument();
744 goto onError;
745 }
746 return PyUnicode_AS_UNICODE(unicode);
747
748 onError:
749 return NULL;
750}
751
Martin v. Löwis18e16552006-02-15 17:27:45 +0000752Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000753{
754 if (!PyUnicode_Check(unicode)) {
755 PyErr_BadArgument();
756 goto onError;
757 }
758 return PyUnicode_GET_SIZE(unicode);
759
760 onError:
761 return -1;
762}
763
Thomas Wouters78890102000-07-22 19:25:51 +0000764const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000765{
766 return unicode_default_encoding;
767}
768
769int PyUnicode_SetDefaultEncoding(const char *encoding)
770{
771 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000772
Fred Drakee4315f52000-05-09 19:53:39 +0000773 /* Make sure the encoding is valid. As side effect, this also
774 loads the encoding into the codec registry cache. */
775 v = _PyCodec_Lookup(encoding);
776 if (v == NULL)
777 goto onError;
778 Py_DECREF(v);
779 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000780 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000781 sizeof(unicode_default_encoding));
782 return 0;
783
784 onError:
785 return -1;
786}
787
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000788/* error handling callback helper:
789 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000790 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000791 and adjust various state variables.
792 return 0 on success, -1 on error
793*/
794
795static
796int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
797 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000798 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
799 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000800{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000801 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000802
803 PyObject *restuple = NULL;
804 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000805 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
806 Py_ssize_t requiredsize;
807 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000808 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000809 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000810 int res = -1;
811
812 if (*errorHandler == NULL) {
813 *errorHandler = PyCodec_LookupError(errors);
814 if (*errorHandler == NULL)
815 goto onError;
816 }
817
818 if (*exceptionObject == NULL) {
819 *exceptionObject = PyUnicodeDecodeError_Create(
820 encoding, input, insize, *startinpos, *endinpos, reason);
821 if (*exceptionObject == NULL)
822 goto onError;
823 }
824 else {
825 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
826 goto onError;
827 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
828 goto onError;
829 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
830 goto onError;
831 }
832
833 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
834 if (restuple == NULL)
835 goto onError;
836 if (!PyTuple_Check(restuple)) {
837 PyErr_Format(PyExc_TypeError, &argparse[4]);
838 goto onError;
839 }
840 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
841 goto onError;
842 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000843 newpos = insize+newpos;
844 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000845 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000846 goto onError;
847 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000848
849 /* need more space? (at least enough for what we
850 have+the replacement+the rest of the string (starting
851 at the new input position), so we won't have to check space
852 when there are no errors in the rest of the string) */
853 repptr = PyUnicode_AS_UNICODE(repunicode);
854 repsize = PyUnicode_GET_SIZE(repunicode);
855 requiredsize = *outpos + repsize + insize-newpos;
856 if (requiredsize > outsize) {
857 if (requiredsize<2*outsize)
858 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000859 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000860 goto onError;
861 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
862 }
863 *endinpos = newpos;
864 *inptr = input + newpos;
865 Py_UNICODE_COPY(*outptr, repptr, repsize);
866 *outptr += repsize;
867 *outpos += repsize;
868 /* we made it! */
869 res = 0;
870
871 onError:
872 Py_XDECREF(restuple);
873 return res;
874}
875
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000876/* --- UTF-7 Codec -------------------------------------------------------- */
877
878/* see RFC2152 for details */
879
/* Classification of the 128 ASCII code points for UTF-7: indicates
   whether a character is special, i.e. cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00-0x0F: controls; TAB, LF and CR are whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10-0x1F: controls */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20-0x2F: space and punctuation up to '/' */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30-0x3F: digits, then ':' through '?' */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40-0x4F: '@', 'A'-'O' */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F: 'P'-'Z', then '[', backslash, ']', '^', '_' */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60-0x6F: backquote, 'a'-'o' */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F: 'p'-'z', then '{', '|', '}', '~', DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
898
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* SPECIAL: non-zero when c must not be emitted directly.  Anything outside
   1..127 is special; inside that range the utf7_special class decides,
   with class 2 counted only when encodeWS is set and class 3 only when
   encodeO is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* B64: map the low six bits of n to its base64 alphabet character. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR: non-zero when c belongs to the base64 alphabet.  Explicit ASCII
   range tests are used instead of isalnum(): they are defined for every
   integer value of c (isalnum() is not) and do not depend on the locale. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64: inverse of B64; only meaningful when B64CHAR(c) holds. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
/* ENCODE: while at least six bits are pending in ch, emit the topmost six
   as one base64 character through out and reduce bits accordingly.
   out and bits are modified in place; relies on B64 being defined.
   Wrapped in do { } while (0) so it behaves as a single statement. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits) - 6)); \
            (bits) -= 6; \
        } \
    } while (0)

/* DECODE: while at least sixteen bits are pending in ch, take the topmost
   sixteen as one code unit.  A unit in 0xDC00..0xDFFF is reported as an
   error and makes the following unit be skipped; any other unit is stored
   through out.  out, bits and surrogate are modified in place.
   The macro also assigns the caller's `errmsg` variable and jumps to the
   caller's `utf7Error` label, so both must exist at the point of use. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits) - 16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                /* We have already generated an error for the high surrogate \
                   so let's not bother seeing if the low surrogate is correct or not */ \
                (surrogate) = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                /* This is a surrogate pair. Unfortunately we can't represent \
                   it in a 16-bit character */ \
                (surrogate) = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000942PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000943 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000944 const char *errors)
945{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000946 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000947 Py_ssize_t startinpos;
948 Py_ssize_t endinpos;
949 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000950 const char *e;
951 PyUnicodeObject *unicode;
952 Py_UNICODE *p;
953 const char *errmsg = "";
954 int inShift = 0;
955 unsigned int bitsleft = 0;
956 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000957 int surrogate = 0;
958 PyObject *errorHandler = NULL;
959 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000960
961 unicode = _PyUnicode_New(size);
962 if (!unicode)
963 return NULL;
964 if (size == 0)
965 return (PyObject *)unicode;
966
967 p = unicode->str;
968 e = s + size;
969
970 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000971 Py_UNICODE ch;
972 restart:
973 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000974
975 if (inShift) {
976 if ((ch == '-') || !B64CHAR(ch)) {
977 inShift = 0;
978 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000979
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000980 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
981 if (bitsleft >= 6) {
982 /* The shift sequence has a partial character in it. If
983 bitsleft < 6 then we could just classify it as padding
984 but that is not the case here */
985
986 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000987 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000988 }
989 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000990 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000991 here so indicate the potential of a misencoded character. */
992
993 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
994 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
995 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000996 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000997 }
998
999 if (ch == '-') {
1000 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001001 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001002 inShift = 1;
1003 }
1004 } else if (SPECIAL(ch,0,0)) {
1005 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001006 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001007 } else {
1008 *p++ = ch;
1009 }
1010 } else {
1011 charsleft = (charsleft << 6) | UB64(ch);
1012 bitsleft += 6;
1013 s++;
1014 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1015 }
1016 }
1017 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001018 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001019 s++;
1020 if (s < e && *s == '-') {
1021 s++;
1022 *p++ = '+';
1023 } else
1024 {
1025 inShift = 1;
1026 bitsleft = 0;
1027 }
1028 }
1029 else if (SPECIAL(ch,0,0)) {
1030 errmsg = "unexpected special character";
1031 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001032 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001033 }
1034 else {
1035 *p++ = ch;
1036 s++;
1037 }
1038 continue;
1039 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001040 outpos = p-PyUnicode_AS_UNICODE(unicode);
1041 endinpos = s-starts;
1042 if (unicode_decode_call_errorhandler(
1043 errors, &errorHandler,
1044 "utf7", errmsg,
1045 starts, size, &startinpos, &endinpos, &exc, &s,
1046 (PyObject **)&unicode, &outpos, &p))
1047 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001048 }
1049
1050 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001051 outpos = p-PyUnicode_AS_UNICODE(unicode);
1052 endinpos = size;
1053 if (unicode_decode_call_errorhandler(
1054 errors, &errorHandler,
1055 "utf7", "unterminated shift sequence",
1056 starts, size, &startinpos, &endinpos, &exc, &s,
1057 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001058 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001059 if (s < e)
1060 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001061 }
1062
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001063 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001064 goto onError;
1065
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001066 Py_XDECREF(errorHandler);
1067 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001068 return (PyObject *)unicode;
1069
1070onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001071 Py_XDECREF(errorHandler);
1072 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001073 Py_DECREF(unicode);
1074 return NULL;
1075}
1076
1077
1078PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001079 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001080 int encodeSetO,
1081 int encodeWhiteSpace,
1082 const char *errors)
1083{
1084 PyObject *v;
1085 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001086 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001087 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001088 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001089 unsigned int bitsleft = 0;
1090 unsigned long charsleft = 0;
1091 char * out;
1092 char * start;
1093
1094 if (size == 0)
1095 return PyString_FromStringAndSize(NULL, 0);
1096
1097 v = PyString_FromStringAndSize(NULL, cbAllocated);
1098 if (v == NULL)
1099 return NULL;
1100
1101 start = out = PyString_AS_STRING(v);
1102 for (;i < size; ++i) {
1103 Py_UNICODE ch = s[i];
1104
1105 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001106 if (ch == '+') {
1107 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001108 *out++ = '-';
1109 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1110 charsleft = ch;
1111 bitsleft = 16;
1112 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001113 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001114 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001115 } else {
1116 *out++ = (char) ch;
1117 }
1118 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001119 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1120 *out++ = B64(charsleft << (6-bitsleft));
1121 charsleft = 0;
1122 bitsleft = 0;
1123 /* Characters not in the BASE64 set implicitly unshift the sequence
1124 so no '-' is required, except if the character is itself a '-' */
1125 if (B64CHAR(ch) || ch == '-') {
1126 *out++ = '-';
1127 }
1128 inShift = 0;
1129 *out++ = (char) ch;
1130 } else {
1131 bitsleft += 16;
1132 charsleft = (charsleft << 16) | ch;
1133 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1134
1135 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001136 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001137 or '-' then the shift sequence will be terminated implicitly and we
1138 don't have to insert a '-'. */
1139
1140 if (bitsleft == 0) {
1141 if (i + 1 < size) {
1142 Py_UNICODE ch2 = s[i+1];
1143
1144 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001145
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001146 } else if (B64CHAR(ch2) || ch2 == '-') {
1147 *out++ = '-';
1148 inShift = 0;
1149 } else {
1150 inShift = 0;
1151 }
1152
1153 }
1154 else {
1155 *out++ = '-';
1156 inShift = 0;
1157 }
1158 }
Tim Petersced69f82003-09-16 20:30:58 +00001159 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001160 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001161 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001162 if (bitsleft) {
1163 *out++= B64(charsleft << (6-bitsleft) );
1164 *out++ = '-';
1165 }
1166
Tim Peters5de98422002-04-27 18:44:32 +00001167 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001168 return v;
1169}
1170
1171#undef SPECIAL
1172#undef B64
1173#undef B64CHAR
1174#undef UB64
1175#undef ENCODE
1176#undef DECODE
1177
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178/* --- UTF-8 Codec -------------------------------------------------------- */
1179
/* Map UTF-8 encoded prefix byte to sequence length.  zero means
   illegal prefix.  see RFC 2279 for details */
static
char utf8_code_length[256] = {
    /* 0x00-0x7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: not valid as a first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF7: four, 0xF8-0xFB: five, 0xFC-0xFD: six, 0xFE-0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1201
Guido van Rossumd57fd912000-03-10 22:53:23 +00001202PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001203 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001204 const char *errors)
1205{
Walter Dörwald69652032004-09-07 20:24:22 +00001206 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1207}
1208
1209PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001210 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001211 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001212 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001213{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001214 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001215 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001216 Py_ssize_t startinpos;
1217 Py_ssize_t endinpos;
1218 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219 const char *e;
1220 PyUnicodeObject *unicode;
1221 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001222 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001223 PyObject *errorHandler = NULL;
1224 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225
1226 /* Note: size will always be longer than the resulting Unicode
1227 character count */
1228 unicode = _PyUnicode_New(size);
1229 if (!unicode)
1230 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001231 if (size == 0) {
1232 if (consumed)
1233 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001234 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236
1237 /* Unpack UTF-8 encoded data */
1238 p = unicode->str;
1239 e = s + size;
1240
1241 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001242 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243
1244 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001245 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001246 s++;
1247 continue;
1248 }
1249
1250 n = utf8_code_length[ch];
1251
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001252 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001253 if (consumed)
1254 break;
1255 else {
1256 errmsg = "unexpected end of data";
1257 startinpos = s-starts;
1258 endinpos = size;
1259 goto utf8Error;
1260 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001261 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001262
1263 switch (n) {
1264
1265 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001266 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001267 startinpos = s-starts;
1268 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001269 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001270
1271 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001272 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001273 startinpos = s-starts;
1274 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001275 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276
1277 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001278 if ((s[1] & 0xc0) != 0x80) {
1279 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001280 startinpos = s-starts;
1281 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001282 goto utf8Error;
1283 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001284 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001285 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001286 startinpos = s-starts;
1287 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001288 errmsg = "illegal encoding";
1289 goto utf8Error;
1290 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001292 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 break;
1294
1295 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001296 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001297 (s[2] & 0xc0) != 0x80) {
1298 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001299 startinpos = s-starts;
1300 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001301 goto utf8Error;
1302 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001303 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001304 if (ch < 0x0800) {
1305 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001306 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001307
1308 XXX For wide builds (UCS-4) we should probably try
1309 to recombine the surrogates into a single code
1310 unit.
1311 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001312 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001313 startinpos = s-starts;
1314 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001315 goto utf8Error;
1316 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001317 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001318 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001319 break;
1320
1321 case 4:
1322 if ((s[1] & 0xc0) != 0x80 ||
1323 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001324 (s[3] & 0xc0) != 0x80) {
1325 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001326 startinpos = s-starts;
1327 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001328 goto utf8Error;
1329 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001330 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1331 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1332 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001333 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001334 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001335 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001336 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001337 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001338 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001339 startinpos = s-starts;
1340 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001341 goto utf8Error;
1342 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001343#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001344 *p++ = (Py_UNICODE)ch;
1345#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001346 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001347
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001348 /* translate from 10000..10FFFF to 0..FFFF */
1349 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001350
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001351 /* high surrogate = top 10 bits added to D800 */
1352 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001353
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001354 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001355 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001356#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001357 break;
1358
1359 default:
1360 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001361 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001362 startinpos = s-starts;
1363 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001364 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365 }
1366 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001367 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001368
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001369 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001370 outpos = p-PyUnicode_AS_UNICODE(unicode);
1371 if (unicode_decode_call_errorhandler(
1372 errors, &errorHandler,
1373 "utf8", errmsg,
1374 starts, size, &startinpos, &endinpos, &exc, &s,
1375 (PyObject **)&unicode, &outpos, &p))
1376 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001377 }
Walter Dörwald69652032004-09-07 20:24:22 +00001378 if (consumed)
1379 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001380
1381 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001382 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001383 goto onError;
1384
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001385 Py_XDECREF(errorHandler);
1386 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 return (PyObject *)unicode;
1388
1389onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001390 Py_XDECREF(errorHandler);
1391 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 Py_DECREF(unicode);
1393 return NULL;
1394}
1395
Tim Peters602f7402002-04-27 18:03:26 +00001396/* Allocation strategy: if the string is short, convert into a stack buffer
1397 and allocate exactly as much space needed at the end. Else allocate the
1398 maximum possible needed (4 result bytes per Unicode character), and return
1399 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001400*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001401PyObject *
1402PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001403 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001404 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001405{
Tim Peters602f7402002-04-27 18:03:26 +00001406#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001407
Martin v. Löwis18e16552006-02-15 17:27:45 +00001408 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001409 PyObject *v; /* result string object */
1410 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001411 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001412 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001413 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001414
Tim Peters602f7402002-04-27 18:03:26 +00001415 assert(s != NULL);
1416 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001417
Tim Peters602f7402002-04-27 18:03:26 +00001418 if (size <= MAX_SHORT_UNICHARS) {
1419 /* Write into the stack buffer; nallocated can't overflow.
1420 * At the end, we'll allocate exactly as much heap space as it
1421 * turns out we need.
1422 */
1423 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1424 v = NULL; /* will allocate after we're done */
1425 p = stackbuf;
1426 }
1427 else {
1428 /* Overallocate on the heap, and give the excess back at the end. */
1429 nallocated = size * 4;
1430 if (nallocated / 4 != size) /* overflow! */
1431 return PyErr_NoMemory();
1432 v = PyString_FromStringAndSize(NULL, nallocated);
1433 if (v == NULL)
1434 return NULL;
1435 p = PyString_AS_STRING(v);
1436 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001437
Tim Peters602f7402002-04-27 18:03:26 +00001438 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001439 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001440
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001441 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001442 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001443 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001444
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001446 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001447 *p++ = (char)(0xc0 | (ch >> 6));
1448 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001449 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001450 else {
Tim Peters602f7402002-04-27 18:03:26 +00001451 /* Encode UCS2 Unicode ordinals */
1452 if (ch < 0x10000) {
1453 /* Special case: check for high surrogate */
1454 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1455 Py_UCS4 ch2 = s[i];
1456 /* Check for low surrogate and combine the two to
1457 form a UCS4 value */
1458 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001459 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001460 i++;
1461 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001462 }
Tim Peters602f7402002-04-27 18:03:26 +00001463 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001464 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001465 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001466 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1467 *p++ = (char)(0x80 | (ch & 0x3f));
1468 continue;
1469 }
1470encodeUCS4:
1471 /* Encode UCS4 Unicode ordinals */
1472 *p++ = (char)(0xf0 | (ch >> 18));
1473 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1474 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1475 *p++ = (char)(0x80 | (ch & 0x3f));
1476 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001477 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001478
Tim Peters602f7402002-04-27 18:03:26 +00001479 if (v == NULL) {
1480 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001481 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001482 assert(nneeded <= nallocated);
1483 v = PyString_FromStringAndSize(stackbuf, nneeded);
1484 }
1485 else {
1486 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001487 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001488 assert(nneeded <= nallocated);
1489 _PyString_Resize(&v, nneeded);
1490 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001491 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001492
Tim Peters602f7402002-04-27 18:03:26 +00001493#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001494}
1495
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1497{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001498 if (!PyUnicode_Check(unicode)) {
1499 PyErr_BadArgument();
1500 return NULL;
1501 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001502 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1503 PyUnicode_GET_SIZE(unicode),
1504 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001505}
1506
1507/* --- UTF-16 Codec ------------------------------------------------------- */
1508
Tim Peters772747b2001-08-09 22:21:55 +00001509PyObject *
1510PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001511 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001512 const char *errors,
1513 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001514{
Walter Dörwald69652032004-09-07 20:24:22 +00001515 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1516}
1517
1518PyObject *
1519PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001520 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001521 const char *errors,
1522 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001523 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001524{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001525 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001526 Py_ssize_t startinpos;
1527 Py_ssize_t endinpos;
1528 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001529 PyUnicodeObject *unicode;
1530 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001531 const unsigned char *q, *e;
1532 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001533 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001534 /* Offsets from q for retrieving byte pairs in the right order. */
1535#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1536 int ihi = 1, ilo = 0;
1537#else
1538 int ihi = 0, ilo = 1;
1539#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001540 PyObject *errorHandler = NULL;
1541 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542
1543 /* Note: size will always be longer than the resulting Unicode
1544 character count */
1545 unicode = _PyUnicode_New(size);
1546 if (!unicode)
1547 return NULL;
1548 if (size == 0)
1549 return (PyObject *)unicode;
1550
1551 /* Unpack UTF-16 encoded data */
1552 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001553 q = (unsigned char *)s;
1554 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555
1556 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001557 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001558
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001559 /* Check for BOM marks (U+FEFF) in the input and adjust current
1560 byte order setting accordingly. In native mode, the leading BOM
1561 mark is skipped, in all other modes, it is copied to the output
1562 stream as-is (giving a ZWNBSP character). */
1563 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001564 if (size >= 2) {
1565 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001566#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001567 if (bom == 0xFEFF) {
1568 q += 2;
1569 bo = -1;
1570 }
1571 else if (bom == 0xFFFE) {
1572 q += 2;
1573 bo = 1;
1574 }
Tim Petersced69f82003-09-16 20:30:58 +00001575#else
Walter Dörwald69652032004-09-07 20:24:22 +00001576 if (bom == 0xFEFF) {
1577 q += 2;
1578 bo = 1;
1579 }
1580 else if (bom == 0xFFFE) {
1581 q += 2;
1582 bo = -1;
1583 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001584#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001585 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001587
Tim Peters772747b2001-08-09 22:21:55 +00001588 if (bo == -1) {
1589 /* force LE */
1590 ihi = 1;
1591 ilo = 0;
1592 }
1593 else if (bo == 1) {
1594 /* force BE */
1595 ihi = 0;
1596 ilo = 1;
1597 }
1598
1599 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001600 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001601 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001602 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001603 if (consumed)
1604 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001605 errmsg = "truncated data";
1606 startinpos = ((const char *)q)-starts;
1607 endinpos = ((const char *)e)-starts;
1608 goto utf16Error;
1609 /* The remaining input chars are ignored if the callback
1610 chooses to skip the input */
1611 }
1612 ch = (q[ihi] << 8) | q[ilo];
1613
Tim Peters772747b2001-08-09 22:21:55 +00001614 q += 2;
1615
Guido van Rossumd57fd912000-03-10 22:53:23 +00001616 if (ch < 0xD800 || ch > 0xDFFF) {
1617 *p++ = ch;
1618 continue;
1619 }
1620
1621 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001622 if (q >= e) {
1623 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001624 startinpos = (((const char *)q)-2)-starts;
1625 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001626 goto utf16Error;
1627 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001628 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001629 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1630 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001631 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001632#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001633 *p++ = ch;
1634 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001635#else
1636 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001637#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001638 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001639 }
1640 else {
1641 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001642 startinpos = (((const char *)q)-4)-starts;
1643 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001644 goto utf16Error;
1645 }
1646
Guido van Rossumd57fd912000-03-10 22:53:23 +00001647 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001648 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001649 startinpos = (((const char *)q)-2)-starts;
1650 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001651 /* Fall through to report the error */
1652
1653 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001654 outpos = p-PyUnicode_AS_UNICODE(unicode);
1655 if (unicode_decode_call_errorhandler(
1656 errors, &errorHandler,
1657 "utf16", errmsg,
1658 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1659 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001660 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001661 }
1662
1663 if (byteorder)
1664 *byteorder = bo;
1665
Walter Dörwald69652032004-09-07 20:24:22 +00001666 if (consumed)
1667 *consumed = (const char *)q-starts;
1668
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001670 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001671 goto onError;
1672
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001673 Py_XDECREF(errorHandler);
1674 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001675 return (PyObject *)unicode;
1676
1677onError:
1678 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001679 Py_XDECREF(errorHandler);
1680 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001681 return NULL;
1682}
1683
Tim Peters772747b2001-08-09 22:21:55 +00001684PyObject *
1685PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001686 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001687 const char *errors,
1688 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001689{
1690 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001691 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001692#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001693 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001694#else
1695 const int pairs = 0;
1696#endif
Tim Peters772747b2001-08-09 22:21:55 +00001697 /* Offsets from p for storing byte pairs in the right order. */
1698#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1699 int ihi = 1, ilo = 0;
1700#else
1701 int ihi = 0, ilo = 1;
1702#endif
1703
1704#define STORECHAR(CH) \
1705 do { \
1706 p[ihi] = ((CH) >> 8) & 0xff; \
1707 p[ilo] = (CH) & 0xff; \
1708 p += 2; \
1709 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001711#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001712 for (i = pairs = 0; i < size; i++)
1713 if (s[i] >= 0x10000)
1714 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001715#endif
Tim Petersced69f82003-09-16 20:30:58 +00001716 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001717 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718 if (v == NULL)
1719 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001720
Tim Peters772747b2001-08-09 22:21:55 +00001721 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001722 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001723 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001724 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001725 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001726
1727 if (byteorder == -1) {
1728 /* force LE */
1729 ihi = 1;
1730 ilo = 0;
1731 }
1732 else if (byteorder == 1) {
1733 /* force BE */
1734 ihi = 0;
1735 ilo = 1;
1736 }
1737
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001738 while (size-- > 0) {
1739 Py_UNICODE ch = *s++;
1740 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001741#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001742 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001743 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1744 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001746#endif
Tim Peters772747b2001-08-09 22:21:55 +00001747 STORECHAR(ch);
1748 if (ch2)
1749 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001750 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001752#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753}
1754
1755PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1756{
1757 if (!PyUnicode_Check(unicode)) {
1758 PyErr_BadArgument();
1759 return NULL;
1760 }
1761 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1762 PyUnicode_GET_SIZE(unicode),
1763 NULL,
1764 0);
1765}
1766
1767/* --- Unicode Escape Codec ----------------------------------------------- */
1768
Fredrik Lundh06d12682001-01-24 07:59:11 +00001769static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001770
Guido van Rossumd57fd912000-03-10 22:53:23 +00001771PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001772 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001773 const char *errors)
1774{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001775 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001776 Py_ssize_t startinpos;
1777 Py_ssize_t endinpos;
1778 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001779 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001781 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001783 char* message;
1784 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001785 PyObject *errorHandler = NULL;
1786 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001787
Guido van Rossumd57fd912000-03-10 22:53:23 +00001788 /* Escaped strings will always be longer than the resulting
1789 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001790 length after conversion to the true value.
1791 (but if the error callback returns a long replacement string
1792 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 v = _PyUnicode_New(size);
1794 if (v == NULL)
1795 goto onError;
1796 if (size == 0)
1797 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001798
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001799 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001800 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001801
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802 while (s < end) {
1803 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001804 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001805 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001806
1807 /* Non-escape characters are interpreted as Unicode ordinals */
1808 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001809 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810 continue;
1811 }
1812
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001813 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001814 /* \ - Escapes */
1815 s++;
1816 switch (*s++) {
1817
1818 /* \x escapes */
1819 case '\n': break;
1820 case '\\': *p++ = '\\'; break;
1821 case '\'': *p++ = '\''; break;
1822 case '\"': *p++ = '\"'; break;
1823 case 'b': *p++ = '\b'; break;
1824 case 'f': *p++ = '\014'; break; /* FF */
1825 case 't': *p++ = '\t'; break;
1826 case 'n': *p++ = '\n'; break;
1827 case 'r': *p++ = '\r'; break;
1828 case 'v': *p++ = '\013'; break; /* VT */
1829 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1830
1831 /* \OOO (octal) escapes */
1832 case '0': case '1': case '2': case '3':
1833 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001834 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001836 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001837 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001838 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001839 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001840 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001841 break;
1842
Fredrik Lundhccc74732001-02-18 22:13:49 +00001843 /* hex escapes */
1844 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001845 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001846 digits = 2;
1847 message = "truncated \\xXX escape";
1848 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849
Fredrik Lundhccc74732001-02-18 22:13:49 +00001850 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001851 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001852 digits = 4;
1853 message = "truncated \\uXXXX escape";
1854 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001855
Fredrik Lundhccc74732001-02-18 22:13:49 +00001856 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001857 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001858 digits = 8;
1859 message = "truncated \\UXXXXXXXX escape";
1860 hexescape:
1861 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001862 outpos = p-PyUnicode_AS_UNICODE(v);
1863 if (s+digits>end) {
1864 endinpos = size;
1865 if (unicode_decode_call_errorhandler(
1866 errors, &errorHandler,
1867 "unicodeescape", "end of string in escape sequence",
1868 starts, size, &startinpos, &endinpos, &exc, &s,
1869 (PyObject **)&v, &outpos, &p))
1870 goto onError;
1871 goto nextByte;
1872 }
1873 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001874 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001875 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001876 endinpos = (s+i+1)-starts;
1877 if (unicode_decode_call_errorhandler(
1878 errors, &errorHandler,
1879 "unicodeescape", message,
1880 starts, size, &startinpos, &endinpos, &exc, &s,
1881 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001882 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001883 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001884 }
1885 chr = (chr<<4) & ~0xF;
1886 if (c >= '0' && c <= '9')
1887 chr += c - '0';
1888 else if (c >= 'a' && c <= 'f')
1889 chr += 10 + c - 'a';
1890 else
1891 chr += 10 + c - 'A';
1892 }
1893 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001894 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001895 /* _decoding_error will have already written into the
1896 target buffer. */
1897 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001898 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001899 /* when we get here, chr is a 32-bit unicode character */
1900 if (chr <= 0xffff)
1901 /* UCS-2 character */
1902 *p++ = (Py_UNICODE) chr;
1903 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001904 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001905 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001906#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001907 *p++ = chr;
1908#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001909 chr -= 0x10000L;
1910 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001911 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001912#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001913 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001914 endinpos = s-starts;
1915 outpos = p-PyUnicode_AS_UNICODE(v);
1916 if (unicode_decode_call_errorhandler(
1917 errors, &errorHandler,
1918 "unicodeescape", "illegal Unicode character",
1919 starts, size, &startinpos, &endinpos, &exc, &s,
1920 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001921 goto onError;
1922 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001923 break;
1924
1925 /* \N{name} */
1926 case 'N':
1927 message = "malformed \\N character escape";
1928 if (ucnhash_CAPI == NULL) {
1929 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001930 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001931 m = PyImport_ImportModule("unicodedata");
1932 if (m == NULL)
1933 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001934 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001935 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001936 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001937 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00001938 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001939 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001940 if (ucnhash_CAPI == NULL)
1941 goto ucnhashError;
1942 }
1943 if (*s == '{') {
1944 const char *start = s+1;
1945 /* look for the closing brace */
1946 while (*s != '}' && s < end)
1947 s++;
1948 if (s > start && s < end && *s == '}') {
1949 /* found a name. look it up in the unicode database */
1950 message = "unknown Unicode character name";
1951 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001952 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001953 goto store;
1954 }
1955 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001956 endinpos = s-starts;
1957 outpos = p-PyUnicode_AS_UNICODE(v);
1958 if (unicode_decode_call_errorhandler(
1959 errors, &errorHandler,
1960 "unicodeescape", message,
1961 starts, size, &startinpos, &endinpos, &exc, &s,
1962 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001963 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001964 break;
1965
1966 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001967 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001968 message = "\\ at end of string";
1969 s--;
1970 endinpos = s-starts;
1971 outpos = p-PyUnicode_AS_UNICODE(v);
1972 if (unicode_decode_call_errorhandler(
1973 errors, &errorHandler,
1974 "unicodeescape", message,
1975 starts, size, &startinpos, &endinpos, &exc, &s,
1976 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001977 goto onError;
1978 }
1979 else {
1980 *p++ = '\\';
1981 *p++ = (unsigned char)s[-1];
1982 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001983 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001985 nextByte:
1986 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00001988 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001989 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001990 Py_XDECREF(errorHandler);
1991 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001993
Fredrik Lundhccc74732001-02-18 22:13:49 +00001994ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001995 PyErr_SetString(
1996 PyExc_UnicodeError,
1997 "\\N escapes not supported (can't load unicodedata module)"
1998 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001999 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002000 Py_XDECREF(errorHandler);
2001 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002002 return NULL;
2003
Fredrik Lundhccc74732001-02-18 22:13:49 +00002004onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002005 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002006 Py_XDECREF(errorHandler);
2007 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008 return NULL;
2009}
2010
2011/* Return a Unicode-Escape string version of the Unicode object.
2012
2013 If quotes is true, the string is enclosed in u"" or u'' quotes as
2014 appropriate.
2015
2016*/
2017
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002018Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Fredrik Lundh95e2a912006-05-26 11:38:15 +00002019 Py_ssize_t size,
2020 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002021{
2022 /* like wcschr, but doesn't stop at NULL characters */
2023
2024 while (size-- > 0) {
2025 if (*s == ch)
2026 return s;
2027 s++;
2028 }
2029
2030 return NULL;
2031}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002032
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033static
2034PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002035 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036 int quotes)
2037{
2038 PyObject *repr;
2039 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002041 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042
2043 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
2044 if (repr == NULL)
2045 return NULL;
2046
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002047 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048
2049 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002051 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002052 !findchar(s, size, '"')) ? '"' : '\'';
2053 }
2054 while (size-- > 0) {
2055 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002056
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002057 /* Escape quotes and backslashes */
2058 if ((quotes &&
2059 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002060 *p++ = '\\';
2061 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002062 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002063 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002064
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002065#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002066 /* Map 21-bit characters to '\U00xxxxxx' */
2067 else if (ch >= 0x10000) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00002068 Py_ssize_t offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00002069
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002070 /* Resize the string if necessary */
2071 if (offset + 12 > PyString_GET_SIZE(repr)) {
2072 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00002073 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002074 p = PyString_AS_STRING(repr) + offset;
2075 }
2076
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002077 *p++ = '\\';
2078 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002079 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2080 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2081 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2082 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2083 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2084 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2085 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002086 *p++ = hexdigit[ch & 0x0000000F];
2087 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002088 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002089#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002090 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2091 else if (ch >= 0xD800 && ch < 0xDC00) {
2092 Py_UNICODE ch2;
2093 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002094
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002095 ch2 = *s++;
2096 size--;
2097 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2098 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2099 *p++ = '\\';
2100 *p++ = 'U';
2101 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2102 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2103 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2104 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2105 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2106 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2107 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2108 *p++ = hexdigit[ucs & 0x0000000F];
2109 continue;
2110 }
2111 /* Fall through: isolated surrogates are copied as-is */
2112 s--;
2113 size++;
2114 }
2115
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002117 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002118 *p++ = '\\';
2119 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002120 *p++ = hexdigit[(ch >> 12) & 0x000F];
2121 *p++ = hexdigit[(ch >> 8) & 0x000F];
2122 *p++ = hexdigit[(ch >> 4) & 0x000F];
2123 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002124 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002125
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002126 /* Map special whitespace to '\t', \n', '\r' */
2127 else if (ch == '\t') {
2128 *p++ = '\\';
2129 *p++ = 't';
2130 }
2131 else if (ch == '\n') {
2132 *p++ = '\\';
2133 *p++ = 'n';
2134 }
2135 else if (ch == '\r') {
2136 *p++ = '\\';
2137 *p++ = 'r';
2138 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002139
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002140 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002141 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002142 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002143 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002144 *p++ = hexdigit[(ch >> 4) & 0x000F];
2145 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002146 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002147
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148 /* Copy everything else as-is */
2149 else
2150 *p++ = (char) ch;
2151 }
2152 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002153 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002154
2155 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002156 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002157 return repr;
2158}
2159
2160PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002161 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002162{
2163 return unicodeescape_string(s, size, 0);
2164}
2165
2166PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2167{
2168 if (!PyUnicode_Check(unicode)) {
2169 PyErr_BadArgument();
2170 return NULL;
2171 }
2172 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2173 PyUnicode_GET_SIZE(unicode));
2174}
2175
2176/* --- Raw Unicode Escape Codec ------------------------------------------- */
2177
2178PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002179 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002180 const char *errors)
2181{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002182 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002183 Py_ssize_t startinpos;
2184 Py_ssize_t endinpos;
2185 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002186 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002187 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002188 const char *end;
2189 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002190 PyObject *errorHandler = NULL;
2191 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002192
Guido van Rossumd57fd912000-03-10 22:53:23 +00002193 /* Escaped strings will always be longer than the resulting
2194 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002195 length after conversion to the true value. (But decoding error
2196 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002197 v = _PyUnicode_New(size);
2198 if (v == NULL)
2199 goto onError;
2200 if (size == 0)
2201 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002202 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002203 end = s + size;
2204 while (s < end) {
2205 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002206 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002207 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002208 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002209
2210 /* Non-escape characters are interpreted as Unicode ordinals */
2211 if (*s != '\\') {
2212 *p++ = (unsigned char)*s++;
2213 continue;
2214 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002215 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002216
2217 /* \u-escapes are only interpreted iff the number of leading
2218 backslashes if odd */
2219 bs = s;
2220 for (;s < end;) {
2221 if (*s != '\\')
2222 break;
2223 *p++ = (unsigned char)*s++;
2224 }
2225 if (((s - bs) & 1) == 0 ||
2226 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002227 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228 continue;
2229 }
2230 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002231 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232 s++;
2233
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002234 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002235 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002236 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002237 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002238 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002239 endinpos = s-starts;
2240 if (unicode_decode_call_errorhandler(
2241 errors, &errorHandler,
2242 "rawunicodeescape", "truncated \\uXXXX",
2243 starts, size, &startinpos, &endinpos, &exc, &s,
2244 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002245 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002246 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002247 }
2248 x = (x<<4) & ~0xF;
2249 if (c >= '0' && c <= '9')
2250 x += c - '0';
2251 else if (c >= 'a' && c <= 'f')
2252 x += 10 + c - 'a';
2253 else
2254 x += 10 + c - 'A';
2255 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002256#ifndef Py_UNICODE_WIDE
2257 if (x > 0x10000) {
2258 if (unicode_decode_call_errorhandler(
2259 errors, &errorHandler,
2260 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2261 starts, size, &startinpos, &endinpos, &exc, &s,
2262 (PyObject **)&v, &outpos, &p))
2263 goto onError;
2264 }
2265#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002266 *p++ = x;
2267 nextByte:
2268 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002269 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002270 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002271 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002272 Py_XDECREF(errorHandler);
2273 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002274 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002275
Guido van Rossumd57fd912000-03-10 22:53:23 +00002276 onError:
2277 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002278 Py_XDECREF(errorHandler);
2279 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002280 return NULL;
2281}
2282
2283PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002284 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002285{
2286 PyObject *repr;
2287 char *p;
2288 char *q;
2289
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002290 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002292#ifdef Py_UNICODE_WIDE
2293 repr = PyString_FromStringAndSize(NULL, 10 * size);
2294#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002295 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002296#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002297 if (repr == NULL)
2298 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002299 if (size == 0)
2300 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002301
2302 p = q = PyString_AS_STRING(repr);
2303 while (size-- > 0) {
2304 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002305#ifdef Py_UNICODE_WIDE
2306 /* Map 32-bit characters to '\Uxxxxxxxx' */
2307 if (ch >= 0x10000) {
2308 *p++ = '\\';
2309 *p++ = 'U';
2310 *p++ = hexdigit[(ch >> 28) & 0xf];
2311 *p++ = hexdigit[(ch >> 24) & 0xf];
2312 *p++ = hexdigit[(ch >> 20) & 0xf];
2313 *p++ = hexdigit[(ch >> 16) & 0xf];
2314 *p++ = hexdigit[(ch >> 12) & 0xf];
2315 *p++ = hexdigit[(ch >> 8) & 0xf];
2316 *p++ = hexdigit[(ch >> 4) & 0xf];
2317 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002318 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002319 else
2320#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002321 /* Map 16-bit characters to '\uxxxx' */
2322 if (ch >= 256) {
2323 *p++ = '\\';
2324 *p++ = 'u';
2325 *p++ = hexdigit[(ch >> 12) & 0xf];
2326 *p++ = hexdigit[(ch >> 8) & 0xf];
2327 *p++ = hexdigit[(ch >> 4) & 0xf];
2328 *p++ = hexdigit[ch & 15];
2329 }
2330 /* Copy everything else as-is */
2331 else
2332 *p++ = (char) ch;
2333 }
2334 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002335 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002336 return repr;
2337}
2338
2339PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2340{
2341 if (!PyUnicode_Check(unicode)) {
2342 PyErr_BadArgument();
2343 return NULL;
2344 }
2345 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2346 PyUnicode_GET_SIZE(unicode));
2347}
2348
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002349/* --- Unicode Internal Codec ------------------------------------------- */
2350
2351PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002352 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002353 const char *errors)
2354{
2355 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002356 Py_ssize_t startinpos;
2357 Py_ssize_t endinpos;
2358 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002359 PyUnicodeObject *v;
2360 Py_UNICODE *p;
2361 const char *end;
2362 const char *reason;
2363 PyObject *errorHandler = NULL;
2364 PyObject *exc = NULL;
2365
Neal Norwitzd43069c2006-01-08 01:12:10 +00002366#ifdef Py_UNICODE_WIDE
2367 Py_UNICODE unimax = PyUnicode_GetMax();
2368#endif
2369
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002370 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2371 if (v == NULL)
2372 goto onError;
2373 if (PyUnicode_GetSize((PyObject *)v) == 0)
2374 return (PyObject *)v;
2375 p = PyUnicode_AS_UNICODE(v);
2376 end = s + size;
2377
2378 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00002379 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002380 /* We have to sanity check the raw data, otherwise doom looms for
2381 some malformed UCS-4 data. */
2382 if (
2383 #ifdef Py_UNICODE_WIDE
2384 *p > unimax || *p < 0 ||
2385 #endif
2386 end-s < Py_UNICODE_SIZE
2387 )
2388 {
2389 startinpos = s - starts;
2390 if (end-s < Py_UNICODE_SIZE) {
2391 endinpos = end-starts;
2392 reason = "truncated input";
2393 }
2394 else {
2395 endinpos = s - starts + Py_UNICODE_SIZE;
2396 reason = "illegal code point (> 0x10FFFF)";
2397 }
2398 outpos = p - PyUnicode_AS_UNICODE(v);
2399 if (unicode_decode_call_errorhandler(
2400 errors, &errorHandler,
2401 "unicode_internal", reason,
2402 starts, size, &startinpos, &endinpos, &exc, &s,
2403 (PyObject **)&v, &outpos, &p)) {
2404 goto onError;
2405 }
2406 }
2407 else {
2408 p++;
2409 s += Py_UNICODE_SIZE;
2410 }
2411 }
2412
Martin v. Löwis412fb672006-04-13 06:34:32 +00002413 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002414 goto onError;
2415 Py_XDECREF(errorHandler);
2416 Py_XDECREF(exc);
2417 return (PyObject *)v;
2418
2419 onError:
2420 Py_XDECREF(v);
2421 Py_XDECREF(errorHandler);
2422 Py_XDECREF(exc);
2423 return NULL;
2424}
2425
Guido van Rossumd57fd912000-03-10 22:53:23 +00002426/* --- Latin-1 Codec ------------------------------------------------------ */
2427
2428PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002429 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002430 const char *errors)
2431{
2432 PyUnicodeObject *v;
2433 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002434
Guido van Rossumd57fd912000-03-10 22:53:23 +00002435 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002436 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002437 Py_UNICODE r = *(unsigned char*)s;
2438 return PyUnicode_FromUnicode(&r, 1);
2439 }
2440
Guido van Rossumd57fd912000-03-10 22:53:23 +00002441 v = _PyUnicode_New(size);
2442 if (v == NULL)
2443 goto onError;
2444 if (size == 0)
2445 return (PyObject *)v;
2446 p = PyUnicode_AS_UNICODE(v);
2447 while (size-- > 0)
2448 *p++ = (unsigned char)*s++;
2449 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002450
Guido van Rossumd57fd912000-03-10 22:53:23 +00002451 onError:
2452 Py_XDECREF(v);
2453 return NULL;
2454}
2455
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002456/* create or adjust a UnicodeEncodeError */
2457static void make_encode_exception(PyObject **exceptionObject,
2458 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002459 const Py_UNICODE *unicode, Py_ssize_t size,
2460 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002461 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002462{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002463 if (*exceptionObject == NULL) {
2464 *exceptionObject = PyUnicodeEncodeError_Create(
2465 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002466 }
2467 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002468 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2469 goto onError;
2470 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2471 goto onError;
2472 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2473 goto onError;
2474 return;
2475 onError:
2476 Py_DECREF(*exceptionObject);
2477 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002478 }
2479}
2480
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002481/* raises a UnicodeEncodeError */
2482static void raise_encode_exception(PyObject **exceptionObject,
2483 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002484 const Py_UNICODE *unicode, Py_ssize_t size,
2485 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002486 const char *reason)
2487{
2488 make_encode_exception(exceptionObject,
2489 encoding, unicode, size, startpos, endpos, reason);
2490 if (*exceptionObject != NULL)
2491 PyCodec_StrictErrors(*exceptionObject);
2492}
2493
2494/* error handling callback helper:
2495 build arguments, call the callback and check the arguments,
2496 put the result into newpos and return the replacement string, which
2497 has to be freed by the caller */
2498static PyObject *unicode_encode_call_errorhandler(const char *errors,
2499 PyObject **errorHandler,
2500 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002501 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2502 Py_ssize_t startpos, Py_ssize_t endpos,
2503 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002504{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002505 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002506
2507 PyObject *restuple;
2508 PyObject *resunicode;
2509
2510 if (*errorHandler == NULL) {
2511 *errorHandler = PyCodec_LookupError(errors);
2512 if (*errorHandler == NULL)
2513 return NULL;
2514 }
2515
2516 make_encode_exception(exceptionObject,
2517 encoding, unicode, size, startpos, endpos, reason);
2518 if (*exceptionObject == NULL)
2519 return NULL;
2520
2521 restuple = PyObject_CallFunctionObjArgs(
2522 *errorHandler, *exceptionObject, NULL);
2523 if (restuple == NULL)
2524 return NULL;
2525 if (!PyTuple_Check(restuple)) {
2526 PyErr_Format(PyExc_TypeError, &argparse[4]);
2527 Py_DECREF(restuple);
2528 return NULL;
2529 }
2530 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2531 &resunicode, newpos)) {
2532 Py_DECREF(restuple);
2533 return NULL;
2534 }
2535 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002536 *newpos = size+*newpos;
2537 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002538 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002539 Py_DECREF(restuple);
2540 return NULL;
2541 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002542 Py_INCREF(resunicode);
2543 Py_DECREF(restuple);
2544 return resunicode;
2545}
2546
2547static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002548 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002549 const char *errors,
2550 int limit)
2551{
2552 /* output object */
2553 PyObject *res;
2554 /* pointers to the beginning and end+1 of input */
2555 const Py_UNICODE *startp = p;
2556 const Py_UNICODE *endp = p + size;
2557 /* pointer to the beginning of the unencodable characters */
2558 /* const Py_UNICODE *badp = NULL; */
2559 /* pointer into the output */
2560 char *str;
2561 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002562 Py_ssize_t respos = 0;
2563 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00002564 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2565 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002566 PyObject *errorHandler = NULL;
2567 PyObject *exc = NULL;
2568 /* the following variable is used for caching string comparisons
2569 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2570 int known_errorHandler = -1;
2571
2572 /* allocate enough for a simple encoding without
2573 replacements, if we need more, we'll resize */
2574 res = PyString_FromStringAndSize(NULL, size);
2575 if (res == NULL)
2576 goto onError;
2577 if (size == 0)
2578 return res;
2579 str = PyString_AS_STRING(res);
2580 ressize = size;
2581
2582 while (p<endp) {
2583 Py_UNICODE c = *p;
2584
2585 /* can we encode this? */
2586 if (c<limit) {
2587 /* no overflow check, because we know that the space is enough */
2588 *str++ = (char)c;
2589 ++p;
2590 }
2591 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002592 Py_ssize_t unicodepos = p-startp;
2593 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002594 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002595 Py_ssize_t repsize;
2596 Py_ssize_t newpos;
2597 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002598 Py_UNICODE *uni2;
2599 /* startpos for collecting unencodable chars */
2600 const Py_UNICODE *collstart = p;
2601 const Py_UNICODE *collend = p;
2602 /* find all unecodable characters */
2603 while ((collend < endp) && ((*collend)>=limit))
2604 ++collend;
2605 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2606 if (known_errorHandler==-1) {
2607 if ((errors==NULL) || (!strcmp(errors, "strict")))
2608 known_errorHandler = 1;
2609 else if (!strcmp(errors, "replace"))
2610 known_errorHandler = 2;
2611 else if (!strcmp(errors, "ignore"))
2612 known_errorHandler = 3;
2613 else if (!strcmp(errors, "xmlcharrefreplace"))
2614 known_errorHandler = 4;
2615 else
2616 known_errorHandler = 0;
2617 }
2618 switch (known_errorHandler) {
2619 case 1: /* strict */
2620 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2621 goto onError;
2622 case 2: /* replace */
2623 while (collstart++<collend)
2624 *str++ = '?'; /* fall through */
2625 case 3: /* ignore */
2626 p = collend;
2627 break;
2628 case 4: /* xmlcharrefreplace */
2629 respos = str-PyString_AS_STRING(res);
2630 /* determine replacement size (temporarily (mis)uses p) */
2631 for (p = collstart, repsize = 0; p < collend; ++p) {
2632 if (*p<10)
2633 repsize += 2+1+1;
2634 else if (*p<100)
2635 repsize += 2+2+1;
2636 else if (*p<1000)
2637 repsize += 2+3+1;
2638 else if (*p<10000)
2639 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002640#ifndef Py_UNICODE_WIDE
2641 else
2642 repsize += 2+5+1;
2643#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002644 else if (*p<100000)
2645 repsize += 2+5+1;
2646 else if (*p<1000000)
2647 repsize += 2+6+1;
2648 else
2649 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002650#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002651 }
2652 requiredsize = respos+repsize+(endp-collend);
2653 if (requiredsize > ressize) {
2654 if (requiredsize<2*ressize)
2655 requiredsize = 2*ressize;
2656 if (_PyString_Resize(&res, requiredsize))
2657 goto onError;
2658 str = PyString_AS_STRING(res) + respos;
2659 ressize = requiredsize;
2660 }
2661 /* generate replacement (temporarily (mis)uses p) */
2662 for (p = collstart; p < collend; ++p) {
2663 str += sprintf(str, "&#%d;", (int)*p);
2664 }
2665 p = collend;
2666 break;
2667 default:
2668 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2669 encoding, reason, startp, size, &exc,
2670 collstart-startp, collend-startp, &newpos);
2671 if (repunicode == NULL)
2672 goto onError;
2673 /* need more space? (at least enough for what we
2674 have+the replacement+the rest of the string, so
2675 we won't have to check space for encodable characters) */
2676 respos = str-PyString_AS_STRING(res);
2677 repsize = PyUnicode_GET_SIZE(repunicode);
2678 requiredsize = respos+repsize+(endp-collend);
2679 if (requiredsize > ressize) {
2680 if (requiredsize<2*ressize)
2681 requiredsize = 2*ressize;
2682 if (_PyString_Resize(&res, requiredsize)) {
2683 Py_DECREF(repunicode);
2684 goto onError;
2685 }
2686 str = PyString_AS_STRING(res) + respos;
2687 ressize = requiredsize;
2688 }
2689 /* check if there is anything unencodable in the replacement
2690 and copy it to the output */
2691 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2692 c = *uni2;
2693 if (c >= limit) {
2694 raise_encode_exception(&exc, encoding, startp, size,
2695 unicodepos, unicodepos+1, reason);
2696 Py_DECREF(repunicode);
2697 goto onError;
2698 }
2699 *str = (char)c;
2700 }
2701 p = startp + newpos;
2702 Py_DECREF(repunicode);
2703 }
2704 }
2705 }
2706 /* Resize if we allocated to much */
2707 respos = str-PyString_AS_STRING(res);
2708 if (respos<ressize)
2709 /* If this falls res will be NULL */
2710 _PyString_Resize(&res, respos);
2711 Py_XDECREF(errorHandler);
2712 Py_XDECREF(exc);
2713 return res;
2714
2715 onError:
2716 Py_XDECREF(res);
2717 Py_XDECREF(errorHandler);
2718 Py_XDECREF(exc);
2719 return NULL;
2720}
2721
Guido van Rossumd57fd912000-03-10 22:53:23 +00002722PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002723 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002724 const char *errors)
2725{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002726 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727}
2728
2729PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2730{
2731 if (!PyUnicode_Check(unicode)) {
2732 PyErr_BadArgument();
2733 return NULL;
2734 }
2735 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2736 PyUnicode_GET_SIZE(unicode),
2737 NULL);
2738}
2739
2740/* --- 7-bit ASCII Codec -------------------------------------------------- */
2741
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002743 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002744 const char *errors)
2745{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002746 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002747 PyUnicodeObject *v;
2748 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002749 Py_ssize_t startinpos;
2750 Py_ssize_t endinpos;
2751 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002752 const char *e;
2753 PyObject *errorHandler = NULL;
2754 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002755
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002757 if (size == 1 && *(unsigned char*)s < 128) {
2758 Py_UNICODE r = *(unsigned char*)s;
2759 return PyUnicode_FromUnicode(&r, 1);
2760 }
Tim Petersced69f82003-09-16 20:30:58 +00002761
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762 v = _PyUnicode_New(size);
2763 if (v == NULL)
2764 goto onError;
2765 if (size == 0)
2766 return (PyObject *)v;
2767 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002768 e = s + size;
2769 while (s < e) {
2770 register unsigned char c = (unsigned char)*s;
2771 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002772 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002773 ++s;
2774 }
2775 else {
2776 startinpos = s-starts;
2777 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002778 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002779 if (unicode_decode_call_errorhandler(
2780 errors, &errorHandler,
2781 "ascii", "ordinal not in range(128)",
2782 starts, size, &startinpos, &endinpos, &exc, &s,
2783 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002785 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002787 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00002788 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002789 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002790 Py_XDECREF(errorHandler);
2791 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002793
Guido van Rossumd57fd912000-03-10 22:53:23 +00002794 onError:
2795 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002796 Py_XDECREF(errorHandler);
2797 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798 return NULL;
2799}
2800
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002802 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803 const char *errors)
2804{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002805 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806}
2807
2808PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2809{
2810 if (!PyUnicode_Check(unicode)) {
2811 PyErr_BadArgument();
2812 return NULL;
2813 }
2814 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2815 PyUnicode_GET_SIZE(unicode),
2816 NULL);
2817}
2818
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002819#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002820
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002821/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002822
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002823PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002824 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002825 const char *errors)
2826{
2827 PyUnicodeObject *v;
2828 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002829 DWORD usize;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002830
2831 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002832 assert(size < INT_MAX);
2833 usize = MultiByteToWideChar(CP_ACP, 0, s, (int)size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002834 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002835 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2836
2837 v = _PyUnicode_New(usize);
2838 if (v == NULL)
2839 return NULL;
2840 if (usize == 0)
2841 return (PyObject *)v;
2842 p = PyUnicode_AS_UNICODE(v);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002843 if (0 == MultiByteToWideChar(CP_ACP, 0, s, (int)size, p, usize)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002844 Py_DECREF(v);
2845 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2846 }
2847
2848 return (PyObject *)v;
2849}
2850
2851PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002852 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002853 const char *errors)
2854{
2855 PyObject *repr;
2856 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002857 DWORD mbcssize;
2858
2859 /* If there are no characters, bail now! */
2860 if (size==0)
2861 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002862
2863 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002864 assert(size<INT_MAX);
2865 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, (int)size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002866 if (mbcssize==0)
2867 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2868
2869 repr = PyString_FromStringAndSize(NULL, mbcssize);
2870 if (repr == NULL)
2871 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002872 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002873 return repr;
2874
2875 /* Do the conversion */
2876 s = PyString_AS_STRING(repr);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002877 assert(size < INT_MAX);
2878 if (0 == WideCharToMultiByte(CP_ACP, 0, p, (int)size, s, mbcssize, NULL, NULL)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002879 Py_DECREF(repr);
2880 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2881 }
2882 return repr;
2883}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002884
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002885PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2886{
2887 if (!PyUnicode_Check(unicode)) {
2888 PyErr_BadArgument();
2889 return NULL;
2890 }
2891 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2892 PyUnicode_GET_SIZE(unicode),
2893 NULL);
2894}
2895
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002896#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002897
Guido van Rossumd57fd912000-03-10 22:53:23 +00002898/* --- Character Mapping Codec -------------------------------------------- */
2899
Guido van Rossumd57fd912000-03-10 22:53:23 +00002900PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002901 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002902 PyObject *mapping,
2903 const char *errors)
2904{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002905 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002906 Py_ssize_t startinpos;
2907 Py_ssize_t endinpos;
2908 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002909 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002910 PyUnicodeObject *v;
2911 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002912 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002913 PyObject *errorHandler = NULL;
2914 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002915 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002916 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00002917
Guido van Rossumd57fd912000-03-10 22:53:23 +00002918 /* Default to Latin-1 */
2919 if (mapping == NULL)
2920 return PyUnicode_DecodeLatin1(s, size, errors);
2921
2922 v = _PyUnicode_New(size);
2923 if (v == NULL)
2924 goto onError;
2925 if (size == 0)
2926 return (PyObject *)v;
2927 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002928 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002929 if (PyUnicode_CheckExact(mapping)) {
2930 mapstring = PyUnicode_AS_UNICODE(mapping);
2931 maplen = PyUnicode_GET_SIZE(mapping);
2932 while (s < e) {
2933 unsigned char ch = *s;
2934 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002935
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002936 if (ch < maplen)
2937 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002938
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002939 if (x == 0xfffe) {
2940 /* undefined mapping */
2941 outpos = p-PyUnicode_AS_UNICODE(v);
2942 startinpos = s-starts;
2943 endinpos = startinpos+1;
2944 if (unicode_decode_call_errorhandler(
2945 errors, &errorHandler,
2946 "charmap", "character maps to <undefined>",
2947 starts, size, &startinpos, &endinpos, &exc, &s,
2948 (PyObject **)&v, &outpos, &p)) {
2949 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002950 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002951 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002952 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002953 *p++ = x;
2954 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002955 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002956 }
2957 else {
2958 while (s < e) {
2959 unsigned char ch = *s;
2960 PyObject *w, *x;
2961
2962 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2963 w = PyInt_FromLong((long)ch);
2964 if (w == NULL)
2965 goto onError;
2966 x = PyObject_GetItem(mapping, w);
2967 Py_DECREF(w);
2968 if (x == NULL) {
2969 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2970 /* No mapping found means: mapping is undefined. */
2971 PyErr_Clear();
2972 x = Py_None;
2973 Py_INCREF(x);
2974 } else
2975 goto onError;
2976 }
2977
2978 /* Apply mapping */
2979 if (PyInt_Check(x)) {
2980 long value = PyInt_AS_LONG(x);
2981 if (value < 0 || value > 65535) {
2982 PyErr_SetString(PyExc_TypeError,
2983 "character mapping must be in range(65536)");
2984 Py_DECREF(x);
2985 goto onError;
2986 }
2987 *p++ = (Py_UNICODE)value;
2988 }
2989 else if (x == Py_None) {
2990 /* undefined mapping */
2991 outpos = p-PyUnicode_AS_UNICODE(v);
2992 startinpos = s-starts;
2993 endinpos = startinpos+1;
2994 if (unicode_decode_call_errorhandler(
2995 errors, &errorHandler,
2996 "charmap", "character maps to <undefined>",
2997 starts, size, &startinpos, &endinpos, &exc, &s,
2998 (PyObject **)&v, &outpos, &p)) {
2999 Py_DECREF(x);
3000 goto onError;
3001 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003002 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003003 continue;
3004 }
3005 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003006 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003007
3008 if (targetsize == 1)
3009 /* 1-1 mapping */
3010 *p++ = *PyUnicode_AS_UNICODE(x);
3011
3012 else if (targetsize > 1) {
3013 /* 1-n mapping */
3014 if (targetsize > extrachars) {
3015 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003016 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3017 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003018 (targetsize << 2);
3019 extrachars += needed;
3020 if (_PyUnicode_Resize(&v,
3021 PyUnicode_GET_SIZE(v) + needed) < 0) {
3022 Py_DECREF(x);
3023 goto onError;
3024 }
3025 p = PyUnicode_AS_UNICODE(v) + oldpos;
3026 }
3027 Py_UNICODE_COPY(p,
3028 PyUnicode_AS_UNICODE(x),
3029 targetsize);
3030 p += targetsize;
3031 extrachars -= targetsize;
3032 }
3033 /* 1-0 mapping: skip the character */
3034 }
3035 else {
3036 /* wrong return value */
3037 PyErr_SetString(PyExc_TypeError,
3038 "character mapping must return integer, None or unicode");
3039 Py_DECREF(x);
3040 goto onError;
3041 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003042 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003043 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003044 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003045 }
3046 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003047 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003048 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003049 Py_XDECREF(errorHandler);
3050 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003052
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003054 Py_XDECREF(errorHandler);
3055 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056 Py_XDECREF(v);
3057 return NULL;
3058}
3059
Martin v. Löwis3f767792006-06-04 19:36:28 +00003060/* Charmap encoding: the lookup table */
3061
3062struct encoding_map{
3063 PyObject_HEAD
3064 unsigned char level1[32];
3065 int count2, count3;
3066 unsigned char level23[1];
3067};
3068
3069static PyObject*
3070encoding_map_size(PyObject *obj, PyObject* args)
3071{
3072 struct encoding_map *map = (struct encoding_map*)obj;
3073 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3074 128*map->count3);
3075}
3076
3077static PyMethodDef encoding_map_methods[] = {
3078 {"size", encoding_map_size, METH_NOARGS,
3079 PyDoc_STR("Return the size (in bytes) of this object") },
3080 { 0 }
3081};
3082
3083static void
3084encoding_map_dealloc(PyObject* o)
3085{
3086 PyObject_FREE(o);
3087}
3088
3089static PyTypeObject EncodingMapType = {
3090 PyObject_HEAD_INIT(NULL)
3091 0, /*ob_size*/
3092 "EncodingMap", /*tp_name*/
3093 sizeof(struct encoding_map), /*tp_basicsize*/
3094 0, /*tp_itemsize*/
3095 /* methods */
3096 encoding_map_dealloc, /*tp_dealloc*/
3097 0, /*tp_print*/
3098 0, /*tp_getattr*/
3099 0, /*tp_setattr*/
3100 0, /*tp_compare*/
3101 0, /*tp_repr*/
3102 0, /*tp_as_number*/
3103 0, /*tp_as_sequence*/
3104 0, /*tp_as_mapping*/
3105 0, /*tp_hash*/
3106 0, /*tp_call*/
3107 0, /*tp_str*/
3108 0, /*tp_getattro*/
3109 0, /*tp_setattro*/
3110 0, /*tp_as_buffer*/
3111 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3112 0, /*tp_doc*/
3113 0, /*tp_traverse*/
3114 0, /*tp_clear*/
3115 0, /*tp_richcompare*/
3116 0, /*tp_weaklistoffset*/
3117 0, /*tp_iter*/
3118 0, /*tp_iternext*/
3119 encoding_map_methods, /*tp_methods*/
3120 0, /*tp_members*/
3121 0, /*tp_getset*/
3122 0, /*tp_base*/
3123 0, /*tp_dict*/
3124 0, /*tp_descr_get*/
3125 0, /*tp_descr_set*/
3126 0, /*tp_dictoffset*/
3127 0, /*tp_init*/
3128 0, /*tp_alloc*/
3129 0, /*tp_new*/
3130 0, /*tp_free*/
3131 0, /*tp_is_gc*/
3132};
3133
3134PyObject*
3135PyUnicode_BuildEncodingMap(PyObject* string)
3136{
3137 Py_UNICODE *decode;
3138 PyObject *result;
3139 struct encoding_map *mresult;
3140 int i;
3141 int need_dict = 0;
3142 unsigned char level1[32];
3143 unsigned char level2[512];
3144 unsigned char *mlevel1, *mlevel2, *mlevel3;
3145 int count2 = 0, count3 = 0;
3146
3147 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3148 PyErr_BadArgument();
3149 return NULL;
3150 }
3151 decode = PyUnicode_AS_UNICODE(string);
3152 memset(level1, 0xFF, sizeof level1);
3153 memset(level2, 0xFF, sizeof level2);
3154
3155 /* If there isn't a one-to-one mapping of NULL to \0,
3156 or if there are non-BMP characters, we need to use
3157 a mapping dictionary. */
3158 if (decode[0] != 0)
3159 need_dict = 1;
3160 for (i = 1; i < 256; i++) {
3161 int l1, l2;
3162 if (decode[i] == 0
3163 #ifdef Py_UNICODE_WIDE
3164 || decode[i] > 0xFFFF
3165 #endif
3166 ) {
3167 need_dict = 1;
3168 break;
3169 }
3170 if (decode[i] == 0xFFFE)
3171 /* unmapped character */
3172 continue;
3173 l1 = decode[i] >> 11;
3174 l2 = decode[i] >> 7;
3175 if (level1[l1] == 0xFF)
3176 level1[l1] = count2++;
3177 if (level2[l2] == 0xFF)
3178 level2[l2] = count3++;
3179 }
3180
3181 if (count2 >= 0xFF || count3 >= 0xFF)
3182 need_dict = 1;
3183
3184 if (need_dict) {
3185 PyObject *result = PyDict_New();
3186 PyObject *key, *value;
3187 if (!result)
3188 return NULL;
3189 for (i = 0; i < 256; i++) {
3190 key = value = NULL;
3191 key = PyInt_FromLong(decode[i]);
3192 value = PyInt_FromLong(i);
3193 if (!key || !value)
3194 goto failed1;
3195 if (PyDict_SetItem(result, key, value) == -1)
3196 goto failed1;
3197 }
3198 return result;
3199 failed1:
3200 Py_XDECREF(key);
3201 Py_XDECREF(value);
3202 Py_DECREF(result);
3203 return NULL;
3204 }
3205
3206 /* Create a three-level trie */
3207 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3208 16*count2 + 128*count3 - 1);
3209 if (!result)
3210 return PyErr_NoMemory();
3211 PyObject_Init(result, &EncodingMapType);
3212 mresult = (struct encoding_map*)result;
3213 mresult->count2 = count2;
3214 mresult->count3 = count3;
3215 mlevel1 = mresult->level1;
3216 mlevel2 = mresult->level23;
3217 mlevel3 = mresult->level23 + 16*count2;
3218 memcpy(mlevel1, level1, 32);
3219 memset(mlevel2, 0xFF, 16*count2);
3220 memset(mlevel3, 0, 128*count3);
3221 count3 = 0;
3222 for (i = 1; i < 256; i++) {
3223 int o1, o2, o3, i2, i3;
3224 if (decode[i] == 0xFFFE)
3225 /* unmapped character */
3226 continue;
3227 o1 = decode[i]>>11;
3228 o2 = (decode[i]>>7) & 0xF;
3229 i2 = 16*mlevel1[o1] + o2;
3230 if (mlevel2[i2] == 0xFF)
3231 mlevel2[i2] = count3++;
3232 o3 = decode[i] & 0x7F;
3233 i3 = 128*mlevel2[i2] + o3;
3234 mlevel3[i3] = i;
3235 }
3236 return result;
3237}
3238
3239static int
3240encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3241{
3242 struct encoding_map *map = (struct encoding_map*)mapping;
3243 int l1 = c>>11;
3244 int l2 = (c>>7) & 0xF;
3245 int l3 = c & 0x7F;
3246 int i;
3247
3248#ifdef Py_UNICODE_WIDE
3249 if (c > 0xFFFF) {
3250 return -1;
3251 }
3252#endif
3253 if (c == 0)
3254 return 0;
3255 /* level 1*/
3256 i = map->level1[l1];
3257 if (i == 0xFF) {
3258 return -1;
3259 }
3260 /* level 2*/
3261 i = map->level23[16*i+l2];
3262 if (i == 0xFF) {
3263 return -1;
3264 }
3265 /* level 3 */
3266 i = map->level23[16*map->count2 + 128*i + l3];
3267 if (i == 0) {
3268 return -1;
3269 }
3270 return i;
3271}
3272
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003273/* Lookup the character ch in the mapping. If the character
3274 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003275 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003276static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003277{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003278 PyObject *w = PyInt_FromLong((long)c);
3279 PyObject *x;
3280
3281 if (w == NULL)
3282 return NULL;
3283 x = PyObject_GetItem(mapping, w);
3284 Py_DECREF(w);
3285 if (x == NULL) {
3286 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3287 /* No mapping found means: mapping is undefined. */
3288 PyErr_Clear();
3289 x = Py_None;
3290 Py_INCREF(x);
3291 return x;
3292 } else
3293 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003294 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003295 else if (x == Py_None)
3296 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003297 else if (PyInt_Check(x)) {
3298 long value = PyInt_AS_LONG(x);
3299 if (value < 0 || value > 255) {
3300 PyErr_SetString(PyExc_TypeError,
3301 "character mapping must be in range(256)");
3302 Py_DECREF(x);
3303 return NULL;
3304 }
3305 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003307 else if (PyString_Check(x))
3308 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003310 /* wrong return value */
3311 PyErr_SetString(PyExc_TypeError,
3312 "character mapping must return integer, None or str");
3313 Py_DECREF(x);
3314 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003315 }
3316}
3317
Martin v. Löwis3f767792006-06-04 19:36:28 +00003318static int
3319charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3320{
3321 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3322 /* exponentially overallocate to minimize reallocations */
3323 if (requiredsize < 2*outsize)
3324 requiredsize = 2*outsize;
3325 if (_PyString_Resize(outobj, requiredsize)) {
3326 return 0;
3327 }
3328 return 1;
3329}
3330
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003334/* lookup the character, put the result in the output string and adjust
3335 various state variables. Reallocate the output string if not enough
3336 space is available. Return a new reference to the object that
3337 was put in the output buffer, or Py_None, if the mapping was undefined
3338 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003339 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003340static
Martin v. Löwis3f767792006-06-04 19:36:28 +00003341charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003342 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003343{
Martin v. Löwis3f767792006-06-04 19:36:28 +00003344 PyObject *rep;
3345 char *outstart;
3346 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003347
Martin v. Löwis3f767792006-06-04 19:36:28 +00003348 if (mapping->ob_type == &EncodingMapType) {
3349 int res = encoding_map_lookup(c, mapping);
3350 Py_ssize_t requiredsize = *outpos+1;
3351 if (res == -1)
3352 return enc_FAILED;
3353 if (outsize<requiredsize)
3354 if (!charmapencode_resize(outobj, outpos, requiredsize))
3355 return enc_EXCEPTION;
3356 outstart = PyString_AS_STRING(*outobj);
3357 outstart[(*outpos)++] = (char)res;
3358 return enc_SUCCESS;
3359 }
3360
3361 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003362 if (rep==NULL)
Martin v. Löwis3f767792006-06-04 19:36:28 +00003363 return enc_EXCEPTION;
3364 else if (rep==Py_None) {
3365 Py_DECREF(rep);
3366 return enc_FAILED;
3367 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003368 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003369 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003370 if (outsize<requiredsize)
3371 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003372 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003373 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003374 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003375 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003376 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3377 }
3378 else {
3379 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003380 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3381 Py_ssize_t requiredsize = *outpos+repsize;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003382 if (outsize<requiredsize)
3383 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003384 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003385 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003386 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003387 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003388 memcpy(outstart + *outpos, repchars, repsize);
3389 *outpos += repsize;
3390 }
3391 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003392 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003393}
3394
3395/* handle an error in PyUnicode_EncodeCharmap
3396 Return 0 on success, -1 on error */
3397static
3398int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003399 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003400 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003401 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003402 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003403{
3404 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003405 Py_ssize_t repsize;
3406 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003407 Py_UNICODE *uni2;
3408 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003409 Py_ssize_t collstartpos = *inpos;
3410 Py_ssize_t collendpos = *inpos+1;
3411 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003412 char *encoding = "charmap";
3413 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00003414 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003415
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003416 /* find all unencodable characters */
3417 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00003418 PyObject *rep;
3419 if (mapping->ob_type == &EncodingMapType) {
3420 int res = encoding_map_lookup(p[collendpos], mapping);
3421 if (res != -1)
3422 break;
3423 ++collendpos;
3424 continue;
3425 }
3426
3427 rep = charmapencode_lookup(p[collendpos], mapping);
3428 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003429 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003430 else if (rep!=Py_None) {
3431 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003432 break;
3433 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003434 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003435 ++collendpos;
3436 }
3437 /* cache callback name lookup
3438 * (if not done yet, i.e. it's the first error) */
3439 if (*known_errorHandler==-1) {
3440 if ((errors==NULL) || (!strcmp(errors, "strict")))
3441 *known_errorHandler = 1;
3442 else if (!strcmp(errors, "replace"))
3443 *known_errorHandler = 2;
3444 else if (!strcmp(errors, "ignore"))
3445 *known_errorHandler = 3;
3446 else if (!strcmp(errors, "xmlcharrefreplace"))
3447 *known_errorHandler = 4;
3448 else
3449 *known_errorHandler = 0;
3450 }
3451 switch (*known_errorHandler) {
3452 case 1: /* strict */
3453 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3454 return -1;
3455 case 2: /* replace */
3456 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3457 x = charmapencode_output('?', mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003458 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003459 return -1;
3460 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003461 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003462 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3463 return -1;
3464 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003465 }
3466 /* fall through */
3467 case 3: /* ignore */
3468 *inpos = collendpos;
3469 break;
3470 case 4: /* xmlcharrefreplace */
3471 /* generate replacement (temporarily (mis)uses p) */
3472 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3473 char buffer[2+29+1+1];
3474 char *cp;
3475 sprintf(buffer, "&#%d;", (int)p[collpos]);
3476 for (cp = buffer; *cp; ++cp) {
3477 x = charmapencode_output(*cp, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003478 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003479 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003480 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003481 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3482 return -1;
3483 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003484 }
3485 }
3486 *inpos = collendpos;
3487 break;
3488 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003489 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003490 encoding, reason, p, size, exceptionObject,
3491 collstartpos, collendpos, &newpos);
3492 if (repunicode == NULL)
3493 return -1;
3494 /* generate replacement */
3495 repsize = PyUnicode_GET_SIZE(repunicode);
3496 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3497 x = charmapencode_output(*uni2, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003498 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003499 return -1;
3500 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003501 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003502 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003503 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3504 return -1;
3505 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003506 }
3507 *inpos = newpos;
3508 Py_DECREF(repunicode);
3509 }
3510 return 0;
3511}
3512
Guido van Rossumd57fd912000-03-10 22:53:23 +00003513PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003514 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003515 PyObject *mapping,
3516 const char *errors)
3517{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003518 /* output object */
3519 PyObject *res = NULL;
3520 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003521 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003522 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003523 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003524 PyObject *errorHandler = NULL;
3525 PyObject *exc = NULL;
3526 /* the following variable is used for caching string comparisons
3527 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3528 * 3=ignore, 4=xmlcharrefreplace */
3529 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003530
3531 /* Default to Latin-1 */
3532 if (mapping == NULL)
3533 return PyUnicode_EncodeLatin1(p, size, errors);
3534
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003535 /* allocate enough for a simple encoding without
3536 replacements, if we need more, we'll resize */
3537 res = PyString_FromStringAndSize(NULL, size);
3538 if (res == NULL)
3539 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003540 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003541 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003542
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003543 while (inpos<size) {
3544 /* try to encode it */
Martin v. Löwis3f767792006-06-04 19:36:28 +00003545 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3546 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003547 goto onError;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003548 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003549 if (charmap_encoding_error(p, size, &inpos, mapping,
3550 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003551 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003552 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003553 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003554 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003555 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003556 else
3557 /* done with this character => adjust input position */
3558 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003559 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003560
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003561 /* Resize if we allocated to much */
3562 if (respos<PyString_GET_SIZE(res)) {
3563 if (_PyString_Resize(&res, respos))
3564 goto onError;
3565 }
3566 Py_XDECREF(exc);
3567 Py_XDECREF(errorHandler);
3568 return res;
3569
3570 onError:
3571 Py_XDECREF(res);
3572 Py_XDECREF(exc);
3573 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003574 return NULL;
3575}
3576
3577PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3578 PyObject *mapping)
3579{
3580 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3581 PyErr_BadArgument();
3582 return NULL;
3583 }
3584 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3585 PyUnicode_GET_SIZE(unicode),
3586 mapping,
3587 NULL);
3588}
3589
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003590/* create or adjust a UnicodeTranslateError */
3591static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003592 const Py_UNICODE *unicode, Py_ssize_t size,
3593 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003594 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003595{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003596 if (*exceptionObject == NULL) {
3597 *exceptionObject = PyUnicodeTranslateError_Create(
3598 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003599 }
3600 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003601 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3602 goto onError;
3603 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3604 goto onError;
3605 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3606 goto onError;
3607 return;
3608 onError:
3609 Py_DECREF(*exceptionObject);
3610 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003611 }
3612}
3613
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003614/* raises a UnicodeTranslateError */
3615static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003616 const Py_UNICODE *unicode, Py_ssize_t size,
3617 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003618 const char *reason)
3619{
3620 make_translate_exception(exceptionObject,
3621 unicode, size, startpos, endpos, reason);
3622 if (*exceptionObject != NULL)
3623 PyCodec_StrictErrors(*exceptionObject);
3624}
3625
3626/* error handling callback helper:
3627 build arguments, call the callback and check the arguments,
3628 put the result into newpos and return the replacement string, which
3629 has to be freed by the caller */
3630static PyObject *unicode_translate_call_errorhandler(const char *errors,
3631 PyObject **errorHandler,
3632 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003633 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3634 Py_ssize_t startpos, Py_ssize_t endpos,
3635 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003636{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003637 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003638
Martin v. Löwis412fb672006-04-13 06:34:32 +00003639 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003640 PyObject *restuple;
3641 PyObject *resunicode;
3642
3643 if (*errorHandler == NULL) {
3644 *errorHandler = PyCodec_LookupError(errors);
3645 if (*errorHandler == NULL)
3646 return NULL;
3647 }
3648
3649 make_translate_exception(exceptionObject,
3650 unicode, size, startpos, endpos, reason);
3651 if (*exceptionObject == NULL)
3652 return NULL;
3653
3654 restuple = PyObject_CallFunctionObjArgs(
3655 *errorHandler, *exceptionObject, NULL);
3656 if (restuple == NULL)
3657 return NULL;
3658 if (!PyTuple_Check(restuple)) {
3659 PyErr_Format(PyExc_TypeError, &argparse[4]);
3660 Py_DECREF(restuple);
3661 return NULL;
3662 }
3663 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003664 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003665 Py_DECREF(restuple);
3666 return NULL;
3667 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003668 if (i_newpos<0)
3669 *newpos = size+i_newpos;
3670 else
3671 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003672 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003673 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003674 Py_DECREF(restuple);
3675 return NULL;
3676 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003677 Py_INCREF(resunicode);
3678 Py_DECREF(restuple);
3679 return resunicode;
3680}
3681
3682/* Lookup the character ch in the mapping and put the result in result,
3683 which must be decrefed by the caller.
3684 Return 0 on success, -1 on error */
3685static
3686int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3687{
3688 PyObject *w = PyInt_FromLong((long)c);
3689 PyObject *x;
3690
3691 if (w == NULL)
3692 return -1;
3693 x = PyObject_GetItem(mapping, w);
3694 Py_DECREF(w);
3695 if (x == NULL) {
3696 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3697 /* No mapping found means: use 1:1 mapping. */
3698 PyErr_Clear();
3699 *result = NULL;
3700 return 0;
3701 } else
3702 return -1;
3703 }
3704 else if (x == Py_None) {
3705 *result = x;
3706 return 0;
3707 }
3708 else if (PyInt_Check(x)) {
3709 long value = PyInt_AS_LONG(x);
3710 long max = PyUnicode_GetMax();
3711 if (value < 0 || value > max) {
3712 PyErr_Format(PyExc_TypeError,
3713 "character mapping must be in range(0x%lx)", max+1);
3714 Py_DECREF(x);
3715 return -1;
3716 }
3717 *result = x;
3718 return 0;
3719 }
3720 else if (PyUnicode_Check(x)) {
3721 *result = x;
3722 return 0;
3723 }
3724 else {
3725 /* wrong return value */
3726 PyErr_SetString(PyExc_TypeError,
3727 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003728 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003729 return -1;
3730 }
3731}
3732/* ensure that *outobj is at least requiredsize characters long,
3733if not reallocate and adjust various state variables.
3734Return 0 on success, -1 on error */
3735static
Walter Dörwald4894c302003-10-24 14:25:28 +00003736int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003737 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003738{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003739 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003740 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003741 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003742 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003743 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003744 if (requiredsize < 2 * oldsize)
3745 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003746 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003747 return -1;
3748 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003749 }
3750 return 0;
3751}
3752/* lookup the character, put the result in the output string and adjust
3753 various state variables. Return a new reference to the object that
3754 was put in the output buffer in *result, or Py_None, if the mapping was
3755 undefined (in which case no character was written).
3756 The called must decref result.
3757 Return 0 on success, -1 on error. */
3758static
Walter Dörwald4894c302003-10-24 14:25:28 +00003759int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003760 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003761 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003762{
Walter Dörwald4894c302003-10-24 14:25:28 +00003763 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003764 return -1;
3765 if (*res==NULL) {
3766 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003767 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003768 }
3769 else if (*res==Py_None)
3770 ;
3771 else if (PyInt_Check(*res)) {
3772 /* no overflow check, because we know that the space is enough */
3773 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3774 }
3775 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003776 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003777 if (repsize==1) {
3778 /* no overflow check, because we know that the space is enough */
3779 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3780 }
3781 else if (repsize!=0) {
3782 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003783 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003784 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003785 repsize - 1;
3786 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003787 return -1;
3788 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3789 *outp += repsize;
3790 }
3791 }
3792 else
3793 return -1;
3794 return 0;
3795}
3796
3797PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003798 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003799 PyObject *mapping,
3800 const char *errors)
3801{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003802 /* output object */
3803 PyObject *res = NULL;
3804 /* pointers to the beginning and end+1 of input */
3805 const Py_UNICODE *startp = p;
3806 const Py_UNICODE *endp = p + size;
3807 /* pointer into the output */
3808 Py_UNICODE *str;
3809 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003810 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003811 char *reason = "character maps to <undefined>";
3812 PyObject *errorHandler = NULL;
3813 PyObject *exc = NULL;
3814 /* the following variable is used for caching string comparisons
3815 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3816 * 3=ignore, 4=xmlcharrefreplace */
3817 int known_errorHandler = -1;
3818
Guido van Rossumd57fd912000-03-10 22:53:23 +00003819 if (mapping == NULL) {
3820 PyErr_BadArgument();
3821 return NULL;
3822 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003823
3824 /* allocate enough for a simple 1:1 translation without
3825 replacements, if we need more, we'll resize */
3826 res = PyUnicode_FromUnicode(NULL, size);
3827 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003828 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003829 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003830 return res;
3831 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003833 while (p<endp) {
3834 /* try to encode it */
3835 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003836 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003837 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003838 goto onError;
3839 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003840 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003841 if (x!=Py_None) /* it worked => adjust input pointer */
3842 ++p;
3843 else { /* untranslatable character */
3844 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003845 Py_ssize_t repsize;
3846 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003847 Py_UNICODE *uni2;
3848 /* startpos for collecting untranslatable chars */
3849 const Py_UNICODE *collstart = p;
3850 const Py_UNICODE *collend = p+1;
3851 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003852
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003853 /* find all untranslatable characters */
3854 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003855 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003856 goto onError;
3857 Py_XDECREF(x);
3858 if (x!=Py_None)
3859 break;
3860 ++collend;
3861 }
3862 /* cache callback name lookup
3863 * (if not done yet, i.e. it's the first error) */
3864 if (known_errorHandler==-1) {
3865 if ((errors==NULL) || (!strcmp(errors, "strict")))
3866 known_errorHandler = 1;
3867 else if (!strcmp(errors, "replace"))
3868 known_errorHandler = 2;
3869 else if (!strcmp(errors, "ignore"))
3870 known_errorHandler = 3;
3871 else if (!strcmp(errors, "xmlcharrefreplace"))
3872 known_errorHandler = 4;
3873 else
3874 known_errorHandler = 0;
3875 }
3876 switch (known_errorHandler) {
3877 case 1: /* strict */
3878 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3879 goto onError;
3880 case 2: /* replace */
3881 /* No need to check for space, this is a 1:1 replacement */
3882 for (coll = collstart; coll<collend; ++coll)
3883 *str++ = '?';
3884 /* fall through */
3885 case 3: /* ignore */
3886 p = collend;
3887 break;
3888 case 4: /* xmlcharrefreplace */
3889 /* generate replacement (temporarily (mis)uses p) */
3890 for (p = collstart; p < collend; ++p) {
3891 char buffer[2+29+1+1];
3892 char *cp;
3893 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003894 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003895 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3896 goto onError;
3897 for (cp = buffer; *cp; ++cp)
3898 *str++ = *cp;
3899 }
3900 p = collend;
3901 break;
3902 default:
3903 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3904 reason, startp, size, &exc,
3905 collstart-startp, collend-startp, &newpos);
3906 if (repunicode == NULL)
3907 goto onError;
3908 /* generate replacement */
3909 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003910 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003911 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3912 Py_DECREF(repunicode);
3913 goto onError;
3914 }
3915 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3916 *str++ = *uni2;
3917 p = startp + newpos;
3918 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003919 }
3920 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003921 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003922 /* Resize if we allocated to much */
3923 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003924 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003925 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003926 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003927 }
3928 Py_XDECREF(exc);
3929 Py_XDECREF(errorHandler);
3930 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003931
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003932 onError:
3933 Py_XDECREF(res);
3934 Py_XDECREF(exc);
3935 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003936 return NULL;
3937}
3938
3939PyObject *PyUnicode_Translate(PyObject *str,
3940 PyObject *mapping,
3941 const char *errors)
3942{
3943 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003944
Guido van Rossumd57fd912000-03-10 22:53:23 +00003945 str = PyUnicode_FromObject(str);
3946 if (str == NULL)
3947 goto onError;
3948 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3949 PyUnicode_GET_SIZE(str),
3950 mapping,
3951 errors);
3952 Py_DECREF(str);
3953 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003954
Guido van Rossumd57fd912000-03-10 22:53:23 +00003955 onError:
3956 Py_XDECREF(str);
3957 return NULL;
3958}
Tim Petersced69f82003-09-16 20:30:58 +00003959
Guido van Rossum9e896b32000-04-05 20:11:21 +00003960/* --- Decimal Encoder ---------------------------------------------------- */
3961
3962int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003963 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00003964 char *output,
3965 const char *errors)
3966{
3967 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003968 PyObject *errorHandler = NULL;
3969 PyObject *exc = NULL;
3970 const char *encoding = "decimal";
3971 const char *reason = "invalid decimal Unicode string";
3972 /* the following variable is used for caching string comparisons
3973 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3974 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003975
3976 if (output == NULL) {
3977 PyErr_BadArgument();
3978 return -1;
3979 }
3980
3981 p = s;
3982 end = s + length;
3983 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003984 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003985 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003986 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003987 Py_ssize_t repsize;
3988 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003989 Py_UNICODE *uni2;
3990 Py_UNICODE *collstart;
3991 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003992
Guido van Rossum9e896b32000-04-05 20:11:21 +00003993 if (Py_UNICODE_ISSPACE(ch)) {
3994 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003995 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003996 continue;
3997 }
3998 decimal = Py_UNICODE_TODECIMAL(ch);
3999 if (decimal >= 0) {
4000 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004001 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004002 continue;
4003 }
Guido van Rossumba477042000-04-06 18:18:10 +00004004 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004005 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004006 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004007 continue;
4008 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004009 /* All other characters are considered unencodable */
4010 collstart = p;
4011 collend = p+1;
4012 while (collend < end) {
4013 if ((0 < *collend && *collend < 256) ||
4014 !Py_UNICODE_ISSPACE(*collend) ||
4015 Py_UNICODE_TODECIMAL(*collend))
4016 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004017 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004018 /* cache callback name lookup
4019 * (if not done yet, i.e. it's the first error) */
4020 if (known_errorHandler==-1) {
4021 if ((errors==NULL) || (!strcmp(errors, "strict")))
4022 known_errorHandler = 1;
4023 else if (!strcmp(errors, "replace"))
4024 known_errorHandler = 2;
4025 else if (!strcmp(errors, "ignore"))
4026 known_errorHandler = 3;
4027 else if (!strcmp(errors, "xmlcharrefreplace"))
4028 known_errorHandler = 4;
4029 else
4030 known_errorHandler = 0;
4031 }
4032 switch (known_errorHandler) {
4033 case 1: /* strict */
4034 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4035 goto onError;
4036 case 2: /* replace */
4037 for (p = collstart; p < collend; ++p)
4038 *output++ = '?';
4039 /* fall through */
4040 case 3: /* ignore */
4041 p = collend;
4042 break;
4043 case 4: /* xmlcharrefreplace */
4044 /* generate replacement (temporarily (mis)uses p) */
4045 for (p = collstart; p < collend; ++p)
4046 output += sprintf(output, "&#%d;", (int)*p);
4047 p = collend;
4048 break;
4049 default:
4050 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4051 encoding, reason, s, length, &exc,
4052 collstart-s, collend-s, &newpos);
4053 if (repunicode == NULL)
4054 goto onError;
4055 /* generate replacement */
4056 repsize = PyUnicode_GET_SIZE(repunicode);
4057 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4058 Py_UNICODE ch = *uni2;
4059 if (Py_UNICODE_ISSPACE(ch))
4060 *output++ = ' ';
4061 else {
4062 decimal = Py_UNICODE_TODECIMAL(ch);
4063 if (decimal >= 0)
4064 *output++ = '0' + decimal;
4065 else if (0 < ch && ch < 256)
4066 *output++ = (char)ch;
4067 else {
4068 Py_DECREF(repunicode);
4069 raise_encode_exception(&exc, encoding,
4070 s, length, collstart-s, collend-s, reason);
4071 goto onError;
4072 }
4073 }
4074 }
4075 p = s + newpos;
4076 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004077 }
4078 }
4079 /* 0-terminate the output string */
4080 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004081 Py_XDECREF(exc);
4082 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004083 return 0;
4084
4085 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004086 Py_XDECREF(exc);
4087 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004088 return -1;
4089}
4090
Guido van Rossumd57fd912000-03-10 22:53:23 +00004091/* --- Helpers ------------------------------------------------------------ */
4092
Fredrik Lundha50d2012006-05-26 17:04:58 +00004093#define STRINGLIB_CHAR Py_UNICODE
Fredrik Lundh6471ee42006-05-24 14:28:11 +00004094
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004095#define STRINGLIB_LEN PyUnicode_GET_SIZE
Fredrik Lundhb9479482006-05-26 17:22:38 +00004096#define STRINGLIB_NEW PyUnicode_FromUnicode
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004097#define STRINGLIB_STR PyUnicode_AS_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00004098
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00004099Py_LOCAL_INLINE(int)
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004100STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
4101{
Fredrik Lundh9c0e9c02006-05-26 18:24:15 +00004102 if (str[0] != other[0])
4103 return 1;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004104 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
4105}
4106
Fredrik Lundhb9479482006-05-26 17:22:38 +00004107#define STRINGLIB_EMPTY unicode_empty
4108
Fredrik Lundha50d2012006-05-26 17:04:58 +00004109#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004110
4111#include "stringlib/count.h"
4112#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00004113#include "stringlib/partition.h"
4114
/* Helper macro to fix up start/end slice values.

   Operates on the variables `start` and `end` of the enclosing scope,
   using (obj)->length as the sequence length: a negative value is taken
   relative to the end and then clamped to 0, and `end` is additionally
   clamped to the length.  `start` is NOT clamped to the length.

   Wrapped in do { } while (0) so that the expansion is a single
   statement and can be used safely as the body of an unbraced if/else;
   invoke it with a trailing semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
4127
Martin v. Löwis18e16552006-02-15 17:27:45 +00004128Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004129 PyObject *substr,
4130 Py_ssize_t start,
4131 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004132{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004133 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004134 PyUnicodeObject* str_obj;
4135 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004136
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004137 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4138 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004139 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004140 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4141 if (!sub_obj) {
4142 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004143 return -1;
4144 }
Tim Petersced69f82003-09-16 20:30:58 +00004145
Fredrik Lundhc8162812006-05-26 19:33:03 +00004146 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004147
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004148 result = stringlib_count(
4149 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4150 );
4151
4152 Py_DECREF(sub_obj);
4153 Py_DECREF(str_obj);
4154
Guido van Rossumd57fd912000-03-10 22:53:23 +00004155 return result;
4156}
4157
Martin v. Löwis18e16552006-02-15 17:27:45 +00004158Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004159 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004160 Py_ssize_t start,
4161 Py_ssize_t end,
4162 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004163{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004164 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004165
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004166 str = PyUnicode_FromObject(str);
4167 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004168 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004169 sub = PyUnicode_FromObject(sub);
4170 if (!sub) {
4171 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004172 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004173 }
Tim Petersced69f82003-09-16 20:30:58 +00004174
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004175 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004176 result = stringlib_find_slice(
4177 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4178 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4179 start, end
4180 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004181 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004182 result = stringlib_rfind_slice(
4183 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4184 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4185 start, end
4186 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004187
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004188 Py_DECREF(str);
4189 Py_DECREF(sub);
4190
Guido van Rossumd57fd912000-03-10 22:53:23 +00004191 return result;
4192}
4193
Tim Petersced69f82003-09-16 20:30:58 +00004194static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004195int tailmatch(PyUnicodeObject *self,
4196 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004197 Py_ssize_t start,
4198 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199 int direction)
4200{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004201 if (substring->length == 0)
4202 return 1;
4203
Fredrik Lundhc8162812006-05-26 19:33:03 +00004204 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004205
4206 end -= substring->length;
4207 if (end < start)
4208 return 0;
4209
4210 if (direction > 0) {
4211 if (Py_UNICODE_MATCH(self, end, substring))
4212 return 1;
4213 } else {
4214 if (Py_UNICODE_MATCH(self, start, substring))
4215 return 1;
4216 }
4217
4218 return 0;
4219}
4220
Martin v. Löwis18e16552006-02-15 17:27:45 +00004221Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004222 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004223 Py_ssize_t start,
4224 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004225 int direction)
4226{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004227 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004228
Guido van Rossumd57fd912000-03-10 22:53:23 +00004229 str = PyUnicode_FromObject(str);
4230 if (str == NULL)
4231 return -1;
4232 substr = PyUnicode_FromObject(substr);
4233 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004234 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004235 return -1;
4236 }
Tim Petersced69f82003-09-16 20:30:58 +00004237
Guido van Rossumd57fd912000-03-10 22:53:23 +00004238 result = tailmatch((PyUnicodeObject *)str,
4239 (PyUnicodeObject *)substr,
4240 start, end, direction);
4241 Py_DECREF(str);
4242 Py_DECREF(substr);
4243 return result;
4244}
4245
Guido van Rossumd57fd912000-03-10 22:53:23 +00004246/* Apply fixfct filter to the Unicode object self and return a
4247 reference to the modified object */
4248
Tim Petersced69f82003-09-16 20:30:58 +00004249static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004250PyObject *fixup(PyUnicodeObject *self,
4251 int (*fixfct)(PyUnicodeObject *s))
4252{
4253
4254 PyUnicodeObject *u;
4255
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004256 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004257 if (u == NULL)
4258 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004259
4260 Py_UNICODE_COPY(u->str, self->str, self->length);
4261
Tim Peters7a29bd52001-09-12 03:03:31 +00004262 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004263 /* fixfct should return TRUE if it modified the buffer. If
4264 FALSE, return a reference to the original buffer instead
4265 (to save space, not time) */
4266 Py_INCREF(self);
4267 Py_DECREF(u);
4268 return (PyObject*) self;
4269 }
4270 return (PyObject*) u;
4271}
4272
Tim Petersced69f82003-09-16 20:30:58 +00004273static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004274int fixupper(PyUnicodeObject *self)
4275{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004276 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004277 Py_UNICODE *s = self->str;
4278 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004279
Guido van Rossumd57fd912000-03-10 22:53:23 +00004280 while (len-- > 0) {
4281 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004282
Guido van Rossumd57fd912000-03-10 22:53:23 +00004283 ch = Py_UNICODE_TOUPPER(*s);
4284 if (ch != *s) {
4285 status = 1;
4286 *s = ch;
4287 }
4288 s++;
4289 }
4290
4291 return status;
4292}
4293
Tim Petersced69f82003-09-16 20:30:58 +00004294static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004295int fixlower(PyUnicodeObject *self)
4296{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004297 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004298 Py_UNICODE *s = self->str;
4299 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004300
Guido van Rossumd57fd912000-03-10 22:53:23 +00004301 while (len-- > 0) {
4302 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004303
Guido van Rossumd57fd912000-03-10 22:53:23 +00004304 ch = Py_UNICODE_TOLOWER(*s);
4305 if (ch != *s) {
4306 status = 1;
4307 *s = ch;
4308 }
4309 s++;
4310 }
4311
4312 return status;
4313}
4314
Tim Petersced69f82003-09-16 20:30:58 +00004315static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004316int fixswapcase(PyUnicodeObject *self)
4317{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004318 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004319 Py_UNICODE *s = self->str;
4320 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004321
Guido van Rossumd57fd912000-03-10 22:53:23 +00004322 while (len-- > 0) {
4323 if (Py_UNICODE_ISUPPER(*s)) {
4324 *s = Py_UNICODE_TOLOWER(*s);
4325 status = 1;
4326 } else if (Py_UNICODE_ISLOWER(*s)) {
4327 *s = Py_UNICODE_TOUPPER(*s);
4328 status = 1;
4329 }
4330 s++;
4331 }
4332
4333 return status;
4334}
4335
Tim Petersced69f82003-09-16 20:30:58 +00004336static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004337int fixcapitalize(PyUnicodeObject *self)
4338{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004339 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004340 Py_UNICODE *s = self->str;
4341 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004342
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004343 if (len == 0)
4344 return 0;
4345 if (Py_UNICODE_ISLOWER(*s)) {
4346 *s = Py_UNICODE_TOUPPER(*s);
4347 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004348 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004349 s++;
4350 while (--len > 0) {
4351 if (Py_UNICODE_ISUPPER(*s)) {
4352 *s = Py_UNICODE_TOLOWER(*s);
4353 status = 1;
4354 }
4355 s++;
4356 }
4357 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004358}
4359
4360static
4361int fixtitle(PyUnicodeObject *self)
4362{
4363 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4364 register Py_UNICODE *e;
4365 int previous_is_cased;
4366
4367 /* Shortcut for single character strings */
4368 if (PyUnicode_GET_SIZE(self) == 1) {
4369 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4370 if (*p != ch) {
4371 *p = ch;
4372 return 1;
4373 }
4374 else
4375 return 0;
4376 }
Tim Petersced69f82003-09-16 20:30:58 +00004377
Guido van Rossumd57fd912000-03-10 22:53:23 +00004378 e = p + PyUnicode_GET_SIZE(self);
4379 previous_is_cased = 0;
4380 for (; p < e; p++) {
4381 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004382
Guido van Rossumd57fd912000-03-10 22:53:23 +00004383 if (previous_is_cased)
4384 *p = Py_UNICODE_TOLOWER(ch);
4385 else
4386 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004387
4388 if (Py_UNICODE_ISLOWER(ch) ||
4389 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004390 Py_UNICODE_ISTITLE(ch))
4391 previous_is_cased = 1;
4392 else
4393 previous_is_cased = 0;
4394 }
4395 return 1;
4396}
4397
Tim Peters8ce9f162004-08-27 01:49:32 +00004398PyObject *
4399PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004400{
Tim Peters8ce9f162004-08-27 01:49:32 +00004401 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004402 const Py_UNICODE blank = ' ';
4403 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00004404 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004405 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00004406 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4407 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004408 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4409 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004410 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004411 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004412 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004413
Tim Peters05eba1f2004-08-27 21:32:02 +00004414 fseq = PySequence_Fast(seq, "");
4415 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004416 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004417 }
4418
Tim Peters91879ab2004-08-27 22:35:44 +00004419 /* Grrrr. A codec may be invoked to convert str objects to
4420 * Unicode, and so it's possible to call back into Python code
4421 * during PyUnicode_FromObject(), and so it's possible for a sick
4422 * codec to change the size of fseq (if seq is a list). Therefore
4423 * we have to keep refetching the size -- can't assume seqlen
4424 * is invariant.
4425 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004426 seqlen = PySequence_Fast_GET_SIZE(fseq);
4427 /* If empty sequence, return u"". */
4428 if (seqlen == 0) {
4429 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4430 goto Done;
4431 }
4432 /* If singleton sequence with an exact Unicode, return that. */
4433 if (seqlen == 1) {
4434 item = PySequence_Fast_GET_ITEM(fseq, 0);
4435 if (PyUnicode_CheckExact(item)) {
4436 Py_INCREF(item);
4437 res = (PyUnicodeObject *)item;
4438 goto Done;
4439 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004440 }
4441
Tim Peters05eba1f2004-08-27 21:32:02 +00004442 /* At least two items to join, or one that isn't exact Unicode. */
4443 if (seqlen > 1) {
4444 /* Set up sep and seplen -- they're needed. */
4445 if (separator == NULL) {
4446 sep = &blank;
4447 seplen = 1;
4448 }
4449 else {
4450 internal_separator = PyUnicode_FromObject(separator);
4451 if (internal_separator == NULL)
4452 goto onError;
4453 sep = PyUnicode_AS_UNICODE(internal_separator);
4454 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004455 /* In case PyUnicode_FromObject() mutated seq. */
4456 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004457 }
4458 }
4459
4460 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004461 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004462 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004463 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004464 res_p = PyUnicode_AS_UNICODE(res);
4465 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004466
Tim Peters05eba1f2004-08-27 21:32:02 +00004467 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00004468 Py_ssize_t itemlen;
4469 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004470
4471 item = PySequence_Fast_GET_ITEM(fseq, i);
4472 /* Convert item to Unicode. */
4473 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4474 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00004475 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004476 " %.80s found",
4477 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004478 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004479 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004480 item = PyUnicode_FromObject(item);
4481 if (item == NULL)
4482 goto onError;
4483 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004484
Tim Peters91879ab2004-08-27 22:35:44 +00004485 /* In case PyUnicode_FromObject() mutated seq. */
4486 seqlen = PySequence_Fast_GET_SIZE(fseq);
4487
Tim Peters8ce9f162004-08-27 01:49:32 +00004488 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004490 new_res_used = res_used + itemlen;
Tim Peters286085c2006-05-22 19:17:04 +00004491 if (new_res_used <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004492 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004493 if (i < seqlen - 1) {
4494 new_res_used += seplen;
Tim Peters286085c2006-05-22 19:17:04 +00004495 if (new_res_used <= 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004496 goto Overflow;
4497 }
4498 if (new_res_used > res_alloc) {
4499 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004500 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004501 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00004502 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004503 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004504 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00004505 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004506 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004507 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004508 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004509 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004510 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004511
4512 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004513 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004514 res_p += itemlen;
4515 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004516 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004517 res_p += seplen;
4518 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004519 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004520 res_used = new_res_used;
4521 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004522
Tim Peters05eba1f2004-08-27 21:32:02 +00004523 /* Shrink res to match the used area; this probably can't fail,
4524 * but it's cheap to check.
4525 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004526 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004527 goto onError;
4528
4529 Done:
4530 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004531 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004532 return (PyObject *)res;
4533
Tim Peters8ce9f162004-08-27 01:49:32 +00004534 Overflow:
4535 PyErr_SetString(PyExc_OverflowError,
4536 "join() is too long for a Python string");
4537 Py_DECREF(item);
4538 /* fall through */
4539
Guido van Rossumd57fd912000-03-10 22:53:23 +00004540 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004541 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004542 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004543 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004544 return NULL;
4545}
4546
Tim Petersced69f82003-09-16 20:30:58 +00004547static
4548PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004549 Py_ssize_t left,
4550 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004551 Py_UNICODE fill)
4552{
4553 PyUnicodeObject *u;
4554
4555 if (left < 0)
4556 left = 0;
4557 if (right < 0)
4558 right = 0;
4559
Tim Peters7a29bd52001-09-12 03:03:31 +00004560 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004561 Py_INCREF(self);
4562 return self;
4563 }
4564
4565 u = _PyUnicode_New(left + self->length + right);
4566 if (u) {
4567 if (left)
4568 Py_UNICODE_FILL(u->str, fill, left);
4569 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4570 if (right)
4571 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4572 }
4573
4574 return u;
4575}
4576
/* Append the slice data[left:right] to `list` as a new Unicode object.

   Relies on the enclosing function providing a `PyObject *str` scratch
   variable, the `list` being built, and an `onError` label, which is
   jumped to if the slice cannot be created or appended.  The temporary
   reference held in `str` is released in either case.

   Wrapped in do { } while (0) so that the expansion is a single
   statement (no dangling `else`); invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4587
4588static
4589PyObject *split_whitespace(PyUnicodeObject *self,
4590 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004591 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004592{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004593 register Py_ssize_t i;
4594 register Py_ssize_t j;
4595 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004596 PyObject *str;
4597
4598 for (i = j = 0; i < len; ) {
4599 /* find a token */
4600 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4601 i++;
4602 j = i;
4603 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4604 i++;
4605 if (j < i) {
4606 if (maxcount-- <= 0)
4607 break;
4608 SPLIT_APPEND(self->str, j, i);
4609 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4610 i++;
4611 j = i;
4612 }
4613 }
4614 if (j < len) {
4615 SPLIT_APPEND(self->str, j, len);
4616 }
4617 return list;
4618
4619 onError:
4620 Py_DECREF(list);
4621 return NULL;
4622}
4623
4624PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004625 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004626{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004627 register Py_ssize_t i;
4628 register Py_ssize_t j;
4629 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004630 PyObject *list;
4631 PyObject *str;
4632 Py_UNICODE *data;
4633
4634 string = PyUnicode_FromObject(string);
4635 if (string == NULL)
4636 return NULL;
4637 data = PyUnicode_AS_UNICODE(string);
4638 len = PyUnicode_GET_SIZE(string);
4639
Guido van Rossumd57fd912000-03-10 22:53:23 +00004640 list = PyList_New(0);
4641 if (!list)
4642 goto onError;
4643
4644 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004645 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004646
Guido van Rossumd57fd912000-03-10 22:53:23 +00004647 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004648 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004649 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004650
4651 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004652 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004653 if (i < len) {
4654 if (data[i] == '\r' && i + 1 < len &&
4655 data[i+1] == '\n')
4656 i += 2;
4657 else
4658 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004659 if (keepends)
4660 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004661 }
Guido van Rossum86662912000-04-11 15:38:46 +00004662 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004663 j = i;
4664 }
4665 if (j < len) {
4666 SPLIT_APPEND(data, j, len);
4667 }
4668
4669 Py_DECREF(string);
4670 return list;
4671
4672 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004673 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004674 Py_DECREF(string);
4675 return NULL;
4676}
4677
Tim Petersced69f82003-09-16 20:30:58 +00004678static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004679PyObject *split_char(PyUnicodeObject *self,
4680 PyObject *list,
4681 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004682 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004683{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004684 register Py_ssize_t i;
4685 register Py_ssize_t j;
4686 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004687 PyObject *str;
4688
4689 for (i = j = 0; i < len; ) {
4690 if (self->str[i] == ch) {
4691 if (maxcount-- <= 0)
4692 break;
4693 SPLIT_APPEND(self->str, j, i);
4694 i = j = i + 1;
4695 } else
4696 i++;
4697 }
4698 if (j <= len) {
4699 SPLIT_APPEND(self->str, j, len);
4700 }
4701 return list;
4702
4703 onError:
4704 Py_DECREF(list);
4705 return NULL;
4706}
4707
Tim Petersced69f82003-09-16 20:30:58 +00004708static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004709PyObject *split_substring(PyUnicodeObject *self,
4710 PyObject *list,
4711 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004712 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004713{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004714 register Py_ssize_t i;
4715 register Py_ssize_t j;
4716 Py_ssize_t len = self->length;
4717 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718 PyObject *str;
4719
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004720 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004721 if (Py_UNICODE_MATCH(self, i, substring)) {
4722 if (maxcount-- <= 0)
4723 break;
4724 SPLIT_APPEND(self->str, j, i);
4725 i = j = i + sublen;
4726 } else
4727 i++;
4728 }
4729 if (j <= len) {
4730 SPLIT_APPEND(self->str, j, len);
4731 }
4732 return list;
4733
4734 onError:
4735 Py_DECREF(list);
4736 return NULL;
4737}
4738
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004739static
4740PyObject *rsplit_whitespace(PyUnicodeObject *self,
4741 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004742 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004743{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004744 register Py_ssize_t i;
4745 register Py_ssize_t j;
4746 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004747 PyObject *str;
4748
4749 for (i = j = len - 1; i >= 0; ) {
4750 /* find a token */
4751 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4752 i--;
4753 j = i;
4754 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4755 i--;
4756 if (j > i) {
4757 if (maxcount-- <= 0)
4758 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004759 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004760 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4761 i--;
4762 j = i;
4763 }
4764 }
4765 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004766 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004767 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004768 if (PyList_Reverse(list) < 0)
4769 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004770 return list;
4771
4772 onError:
4773 Py_DECREF(list);
4774 return NULL;
4775}
4776
4777static
4778PyObject *rsplit_char(PyUnicodeObject *self,
4779 PyObject *list,
4780 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004781 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004782{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004783 register Py_ssize_t i;
4784 register Py_ssize_t j;
4785 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004786 PyObject *str;
4787
4788 for (i = j = len - 1; i >= 0; ) {
4789 if (self->str[i] == ch) {
4790 if (maxcount-- <= 0)
4791 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004792 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004793 j = i = i - 1;
4794 } else
4795 i--;
4796 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004797 if (j >= -1) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004798 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004799 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004800 if (PyList_Reverse(list) < 0)
4801 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004802 return list;
4803
4804 onError:
4805 Py_DECREF(list);
4806 return NULL;
4807}
4808
4809static
4810PyObject *rsplit_substring(PyUnicodeObject *self,
4811 PyObject *list,
4812 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004813 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004814{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004815 register Py_ssize_t i;
4816 register Py_ssize_t j;
4817 Py_ssize_t len = self->length;
4818 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004819 PyObject *str;
4820
4821 for (i = len - sublen, j = len; i >= 0; ) {
4822 if (Py_UNICODE_MATCH(self, i, substring)) {
4823 if (maxcount-- <= 0)
4824 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004825 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004826 j = i;
4827 i -= sublen;
4828 } else
4829 i--;
4830 }
4831 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004832 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004833 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004834 if (PyList_Reverse(list) < 0)
4835 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004836 return list;
4837
4838 onError:
4839 Py_DECREF(list);
4840 return NULL;
4841}
4842
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843#undef SPLIT_APPEND
4844
4845static
4846PyObject *split(PyUnicodeObject *self,
4847 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004848 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849{
4850 PyObject *list;
4851
4852 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004853 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854
4855 list = PyList_New(0);
4856 if (!list)
4857 return NULL;
4858
4859 if (substring == NULL)
4860 return split_whitespace(self,list,maxcount);
4861
4862 else if (substring->length == 1)
4863 return split_char(self,list,substring->str[0],maxcount);
4864
4865 else if (substring->length == 0) {
4866 Py_DECREF(list);
4867 PyErr_SetString(PyExc_ValueError, "empty separator");
4868 return NULL;
4869 }
4870 else
4871 return split_substring(self,list,substring,maxcount);
4872}
4873
Tim Petersced69f82003-09-16 20:30:58 +00004874static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004875PyObject *rsplit(PyUnicodeObject *self,
4876 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004877 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004878{
4879 PyObject *list;
4880
4881 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004882 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004883
4884 list = PyList_New(0);
4885 if (!list)
4886 return NULL;
4887
4888 if (substring == NULL)
4889 return rsplit_whitespace(self,list,maxcount);
4890
4891 else if (substring->length == 1)
4892 return rsplit_char(self,list,substring->str[0],maxcount);
4893
4894 else if (substring->length == 0) {
4895 Py_DECREF(list);
4896 PyErr_SetString(PyExc_ValueError, "empty separator");
4897 return NULL;
4898 }
4899 else
4900 return rsplit_substring(self,list,substring,maxcount);
4901}
4902
4903static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004904PyObject *replace(PyUnicodeObject *self,
4905 PyUnicodeObject *str1,
4906 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004907 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908{
4909 PyUnicodeObject *u;
4910
4911 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004912 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004913
Fredrik Lundh347ee272006-05-24 16:35:18 +00004914 if (str1->length == str2->length) {
4915 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004916 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00004917 if (str1->length == 1) {
4918 /* replace characters */
4919 Py_UNICODE u1, u2;
4920 if (!findchar(self->str, self->length, str1->str[0]))
4921 goto nothing;
4922 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
4923 if (!u)
4924 return NULL;
4925 Py_UNICODE_COPY(u->str, self->str, self->length);
4926 u1 = str1->str[0];
4927 u2 = str2->str[0];
4928 for (i = 0; i < u->length; i++)
4929 if (u->str[i] == u1) {
4930 if (--maxcount < 0)
4931 break;
4932 u->str[i] = u2;
4933 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004934 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00004935 i = fastsearch(
4936 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00004938 if (i < 0)
4939 goto nothing;
4940 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
4941 if (!u)
4942 return NULL;
4943 Py_UNICODE_COPY(u->str, self->str, self->length);
4944 while (i <= self->length - str1->length)
4945 if (Py_UNICODE_MATCH(self, i, str1)) {
4946 if (--maxcount < 0)
4947 break;
4948 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
4949 i += str1->length;
4950 } else
4951 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004952 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004953 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00004954
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00004955 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00004956 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004957 Py_UNICODE *p;
4958
4959 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004960 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004961 if (n > maxcount)
4962 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00004963 if (n == 0)
4964 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00004965 /* new_size = self->length + n * (str2->length - str1->length)); */
4966 delta = (str2->length - str1->length);
4967 if (delta == 0) {
4968 new_size = self->length;
4969 } else {
4970 product = n * (str2->length - str1->length);
4971 if ((product / (str2->length - str1->length)) != n) {
4972 PyErr_SetString(PyExc_OverflowError,
4973 "replace string is too long");
4974 return NULL;
4975 }
4976 new_size = self->length + product;
4977 if (new_size < 0) {
4978 PyErr_SetString(PyExc_OverflowError,
4979 "replace string is too long");
4980 return NULL;
4981 }
4982 }
4983 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00004984 if (!u)
4985 return NULL;
4986 i = 0;
4987 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00004988 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00004989 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00004990 while (n-- > 0) {
4991 /* look for next match */
4992 j = i;
4993 while (j <= e) {
4994 if (Py_UNICODE_MATCH(self, j, str1))
4995 break;
4996 j++;
4997 }
4998 if (j > i) {
4999 if (j > e)
5000 break;
5001 /* copy unchanged part [i:j] */
5002 Py_UNICODE_COPY(p, self->str+i, j-i);
5003 p += j - i;
5004 }
5005 /* copy substitution string */
5006 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005007 Py_UNICODE_COPY(p, str2->str, str2->length);
5008 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005009 }
5010 i = j + str1->length;
5011 }
5012 if (i < self->length)
5013 /* copy tail [i:] */
5014 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005015 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005016 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005017 while (n > 0) {
5018 Py_UNICODE_COPY(p, str2->str, str2->length);
5019 p += str2->length;
5020 if (--n <= 0)
5021 break;
5022 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005023 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005024 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005025 }
5026 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005027 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005028
5029nothing:
5030 /* nothing to replace; return original string (when possible) */
5031 if (PyUnicode_CheckExact(self)) {
5032 Py_INCREF(self);
5033 return (PyObject *) self;
5034 }
5035 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005036}
5037
5038/* --- Unicode Object Methods --------------------------------------------- */
5039
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005040PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005041"S.title() -> unicode\n\
5042\n\
5043Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005044characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005045
5046static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005047unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005048{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005049 return fixup(self, fixtitle);
5050}
5051
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005052PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005053"S.capitalize() -> unicode\n\
5054\n\
5055Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005056have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005057
5058static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005059unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005060{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005061 return fixup(self, fixcapitalize);
5062}
5063
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split on whitespace,
   capitalize every word in place in the list, then join the words
   again.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t n;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list slot */
    for (n = 0; n < PyList_GET_SIZE(words); n++) {
        PyObject *old = PyList_GET_ITEM(words, n);
        PyObject *capitalized = fixup((PyUnicodeObject *)old, fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(old);
        PyList_SET_ITEM(words, n, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
5101
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005102/* Argument converter. Coerces to a single unicode character */
5103
5104static int
5105convert_uc(PyObject *obj, void *addr)
5106{
5107 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5108 PyObject *uniobj;
5109 Py_UNICODE *unistr;
5110
5111 uniobj = PyUnicode_FromObject(obj);
5112 if (uniobj == NULL) {
5113 PyErr_SetString(PyExc_TypeError,
5114 "The fill character cannot be converted to Unicode");
5115 return 0;
5116 }
5117 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5118 PyErr_SetString(PyExc_TypeError,
5119 "The fill character must be exactly one character long");
5120 Py_DECREF(uniobj);
5121 return 0;
5122 }
5123 unistr = PyUnicode_AS_UNICODE(uniobj);
5124 *fillcharloc = unistr[0];
5125 Py_DECREF(uniobj);
5126 return 1;
5127}
5128
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005129PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005130"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005132Return S centered in a Unicode string of length width. Padding is\n\
5133done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005134
5135static PyObject *
5136unicode_center(PyUnicodeObject *self, PyObject *args)
5137{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005138 Py_ssize_t marg, left;
5139 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005140 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005141
Thomas Woutersde017742006-02-16 19:34:37 +00005142 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005143 return NULL;
5144
Tim Peters7a29bd52001-09-12 03:03:31 +00005145 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005146 Py_INCREF(self);
5147 return (PyObject*) self;
5148 }
5149
5150 marg = width - self->length;
5151 left = marg / 2 + (marg & width & 1);
5152
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005153 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005154}
5155
Marc-André Lemburge5034372000-08-08 08:04:29 +00005156#if 0
5157
5158/* This code should go into some future Unicode collation support
5159 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005160 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005161
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005162/* speedy UTF-16 code point order comparison */
5163/* gleaned from: */
5164/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5165
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005166static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005167{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005168 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005169 0, 0, 0, 0, 0, 0, 0, 0,
5170 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005171 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005172};
5173
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174static int
5175unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5176{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005177 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005178
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179 Py_UNICODE *s1 = str1->str;
5180 Py_UNICODE *s2 = str2->str;
5181
5182 len1 = str1->length;
5183 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005184
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005186 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005187
5188 c1 = *s1++;
5189 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005190
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005191 if (c1 > (1<<11) * 26)
5192 c1 += utf16Fixup[c1>>11];
5193 if (c2 > (1<<11) * 26)
5194 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005195 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005196
5197 if (c1 != c2)
5198 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005199
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005200 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005201 }
5202
5203 return (len1 < len2) ? -1 : (len1 != len2);
5204}
5205
Marc-André Lemburge5034372000-08-08 08:04:29 +00005206#else
5207
5208static int
5209unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5210{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005211 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005212
5213 Py_UNICODE *s1 = str1->str;
5214 Py_UNICODE *s2 = str2->str;
5215
5216 len1 = str1->length;
5217 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005218
Marc-André Lemburge5034372000-08-08 08:04:29 +00005219 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005220 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005221
Fredrik Lundh45714e92001-06-26 16:39:36 +00005222 c1 = *s1++;
5223 c2 = *s2++;
5224
5225 if (c1 != c2)
5226 return (c1 < c2) ? -1 : 1;
5227
Marc-André Lemburge5034372000-08-08 08:04:29 +00005228 len1--; len2--;
5229 }
5230
5231 return (len1 < len2) ? -1 : (len1 != len2);
5232}
5233
5234#endif
5235
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236int PyUnicode_Compare(PyObject *left,
5237 PyObject *right)
5238{
5239 PyUnicodeObject *u = NULL, *v = NULL;
5240 int result;
5241
5242 /* Coerce the two arguments */
5243 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5244 if (u == NULL)
5245 goto onError;
5246 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5247 if (v == NULL)
5248 goto onError;
5249
Thomas Wouters7e474022000-07-16 12:04:32 +00005250 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005251 if (v == u) {
5252 Py_DECREF(u);
5253 Py_DECREF(v);
5254 return 0;
5255 }
5256
5257 result = unicode_compare(u, v);
5258
5259 Py_DECREF(u);
5260 Py_DECREF(v);
5261 return result;
5262
5263onError:
5264 Py_XDECREF(u);
5265 Py_XDECREF(v);
5266 return -1;
5267}
5268
Guido van Rossum403d68b2000-03-13 15:55:09 +00005269int PyUnicode_Contains(PyObject *container,
5270 PyObject *element)
5271{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005272 PyObject *str, *sub;
5273 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005274
5275 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005276 sub = PyUnicode_FromObject(element);
5277 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005278 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005279 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00005280 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005281 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00005282
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005283 str = PyUnicode_FromObject(container);
5284 if (!str) {
5285 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00005286 return -1;
5287 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005288
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005289 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00005290
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005291 Py_DECREF(str);
5292 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00005293
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005294 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005295}
5296
Guido van Rossumd57fd912000-03-10 22:53:23 +00005297/* Concat to string or Unicode object giving a new Unicode object. */
5298
5299PyObject *PyUnicode_Concat(PyObject *left,
5300 PyObject *right)
5301{
5302 PyUnicodeObject *u = NULL, *v = NULL, *w;
5303
5304 /* Coerce the two arguments */
5305 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5306 if (u == NULL)
5307 goto onError;
5308 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5309 if (v == NULL)
5310 goto onError;
5311
5312 /* Shortcuts */
5313 if (v == unicode_empty) {
5314 Py_DECREF(v);
5315 return (PyObject *)u;
5316 }
5317 if (u == unicode_empty) {
5318 Py_DECREF(u);
5319 return (PyObject *)v;
5320 }
5321
5322 /* Concat the two Unicode strings */
5323 w = _PyUnicode_New(u->length + v->length);
5324 if (w == NULL)
5325 goto onError;
5326 Py_UNICODE_COPY(w->str, u->str, u->length);
5327 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5328
5329 Py_DECREF(u);
5330 Py_DECREF(v);
5331 return (PyObject *)w;
5332
5333onError:
5334 Py_XDECREF(u);
5335 Py_XDECREF(v);
5336 return NULL;
5337}
5338
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005339PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340"S.count(sub[, start[, end]]) -> int\n\
5341\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00005342Return the number of non-overlapping occurrences of substring sub in\n\
5343Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005344interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345
5346static PyObject *
5347unicode_count(PyUnicodeObject *self, PyObject *args)
5348{
5349 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005350 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005351 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352 PyObject *result;
5353
Guido van Rossumb8872e62000-05-09 14:14:27 +00005354 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5355 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005356 return NULL;
5357
5358 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005359 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360 if (substring == NULL)
5361 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005362
Fredrik Lundhc8162812006-05-26 19:33:03 +00005363 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005364
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005365 result = PyInt_FromSsize_t(
5366 stringlib_count(self->str + start, end - start,
5367 substring->str, substring->length)
5368 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369
5370 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005371
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372 return result;
5373}
5374
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005375PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005376"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005378Encodes S using the codec registered for encoding. encoding defaults\n\
5379to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005380handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005381a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5382'xmlcharrefreplace' as well as any other name registered with\n\
5383codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005384
5385static PyObject *
5386unicode_encode(PyUnicodeObject *self, PyObject *args)
5387{
5388 char *encoding = NULL;
5389 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005390 PyObject *v;
5391
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5393 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005394 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005395 if (v == NULL)
5396 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005397 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5398 PyErr_Format(PyExc_TypeError,
5399 "encoder did not return a string/unicode object "
5400 "(type=%.400s)",
5401 v->ob_type->tp_name);
5402 Py_DECREF(v);
5403 return NULL;
5404 }
5405 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005406
5407 onError:
5408 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005409}
5410
5411PyDoc_STRVAR(decode__doc__,
5412"S.decode([encoding[,errors]]) -> string or unicode\n\
5413\n\
5414Decodes S using the codec registered for encoding. encoding defaults\n\
5415to the default encoding. errors may be given to set a different error\n\
5416handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5417a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5418as well as any other name registerd with codecs.register_error that is\n\
5419able to handle UnicodeDecodeErrors.");
5420
5421static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005422unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005423{
5424 char *encoding = NULL;
5425 char *errors = NULL;
5426 PyObject *v;
5427
5428 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5429 return NULL;
5430 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005431 if (v == NULL)
5432 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005433 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5434 PyErr_Format(PyExc_TypeError,
5435 "decoder did not return a string/unicode object "
5436 "(type=%.400s)",
5437 v->ob_type->tp_name);
5438 Py_DECREF(v);
5439 return NULL;
5440 }
5441 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005442
5443 onError:
5444 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445}
5446
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005447PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448"S.expandtabs([tabsize]) -> unicode\n\
5449\n\
5450Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005451If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452
5453static PyObject*
5454unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5455{
5456 Py_UNICODE *e;
5457 Py_UNICODE *p;
5458 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005459 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460 PyUnicodeObject *u;
5461 int tabsize = 8;
5462
5463 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5464 return NULL;
5465
Thomas Wouters7e474022000-07-16 12:04:32 +00005466 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467 i = j = 0;
5468 e = self->str + self->length;
5469 for (p = self->str; p < e; p++)
5470 if (*p == '\t') {
5471 if (tabsize > 0)
5472 j += tabsize - (j % tabsize);
5473 }
5474 else {
5475 j++;
5476 if (*p == '\n' || *p == '\r') {
5477 i += j;
5478 j = 0;
5479 }
5480 }
5481
5482 /* Second pass: create output string and fill it */
5483 u = _PyUnicode_New(i + j);
5484 if (!u)
5485 return NULL;
5486
5487 j = 0;
5488 q = u->str;
5489
5490 for (p = self->str; p < e; p++)
5491 if (*p == '\t') {
5492 if (tabsize > 0) {
5493 i = tabsize - (j % tabsize);
5494 j += i;
5495 while (i--)
5496 *q++ = ' ';
5497 }
5498 }
5499 else {
5500 j++;
5501 *q++ = *p;
5502 if (*p == '\n' || *p == '\r')
5503 j = 0;
5504 }
5505
5506 return (PyObject*) u;
5507}
5508
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005509PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510"S.find(sub [,start [,end]]) -> int\n\
5511\n\
5512Return the lowest index in S where substring sub is found,\n\
5513such that sub is contained within s[start,end]. Optional\n\
5514arguments start and end are interpreted as in slice notation.\n\
5515\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005516Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517
5518static PyObject *
5519unicode_find(PyUnicodeObject *self, PyObject *args)
5520{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005521 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005522 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005523 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005524 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525
Guido van Rossumb8872e62000-05-09 14:14:27 +00005526 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5527 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005529 substring = PyUnicode_FromObject(substring);
5530 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531 return NULL;
5532
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005533 result = stringlib_find_slice(
5534 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5535 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5536 start, end
5537 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538
5539 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005540
5541 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542}
5543
5544static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005545unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546{
5547 if (index < 0 || index >= self->length) {
5548 PyErr_SetString(PyExc_IndexError, "string index out of range");
5549 return NULL;
5550 }
5551
5552 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5553}
5554
5555static long
5556unicode_hash(PyUnicodeObject *self)
5557{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005558 /* Since Unicode objects compare equal to their ASCII string
5559 counterparts, they should use the individual character values
5560 as basis for their hash value. This is needed to assure that
5561 strings and Unicode objects behave in the same way as
5562 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563
Martin v. Löwis18e16552006-02-15 17:27:45 +00005564 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005565 register Py_UNICODE *p;
5566 register long x;
5567
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568 if (self->hash != -1)
5569 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005570 len = PyUnicode_GET_SIZE(self);
5571 p = PyUnicode_AS_UNICODE(self);
5572 x = *p << 7;
5573 while (--len >= 0)
5574 x = (1000003*x) ^ *p++;
5575 x ^= PyUnicode_GET_SIZE(self);
5576 if (x == -1)
5577 x = -2;
5578 self->hash = x;
5579 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580}
5581
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005582PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583"S.index(sub [,start [,end]]) -> int\n\
5584\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005585Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586
5587static PyObject *
5588unicode_index(PyUnicodeObject *self, PyObject *args)
5589{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005590 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005591 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005592 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005593 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594
Guido van Rossumb8872e62000-05-09 14:14:27 +00005595 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5596 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005598 substring = PyUnicode_FromObject(substring);
5599 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600 return NULL;
5601
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005602 result = stringlib_find_slice(
5603 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5604 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5605 start, end
5606 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607
5608 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005609
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610 if (result < 0) {
5611 PyErr_SetString(PyExc_ValueError, "substring not found");
5612 return NULL;
5613 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005614
Martin v. Löwis18e16552006-02-15 17:27:45 +00005615 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616}
5617
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005618PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005619"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005621Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005622at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623
5624static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005625unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626{
5627 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5628 register const Py_UNICODE *e;
5629 int cased;
5630
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631 /* Shortcut for single character strings */
5632 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005633 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005635 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005636 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005637 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005638
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639 e = p + PyUnicode_GET_SIZE(self);
5640 cased = 0;
5641 for (; p < e; p++) {
5642 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005643
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005645 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646 else if (!cased && Py_UNICODE_ISLOWER(ch))
5647 cased = 1;
5648 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005649 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650}
5651
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005652PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005653"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005655Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005656at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657
5658static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005659unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660{
5661 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5662 register const Py_UNICODE *e;
5663 int cased;
5664
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665 /* Shortcut for single character strings */
5666 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005667 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005669 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005670 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005671 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005672
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673 e = p + PyUnicode_GET_SIZE(self);
5674 cased = 0;
5675 for (; p < e; p++) {
5676 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005677
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005679 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680 else if (!cased && Py_UNICODE_ISUPPER(ch))
5681 cased = 1;
5682 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005683 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684}
5685
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005686PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005687"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005689Return True if S is a titlecased string and there is at least one\n\
5690character in S, i.e. upper- and titlecase characters may only\n\
5691follow uncased characters and lowercase characters only cased ones.\n\
5692Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693
5694static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005695unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696{
5697 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5698 register const Py_UNICODE *e;
5699 int cased, previous_is_cased;
5700
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701 /* Shortcut for single character strings */
5702 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005703 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5704 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005706 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005707 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005708 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005709
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710 e = p + PyUnicode_GET_SIZE(self);
5711 cased = 0;
5712 previous_is_cased = 0;
5713 for (; p < e; p++) {
5714 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005715
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5717 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005718 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719 previous_is_cased = 1;
5720 cased = 1;
5721 }
5722 else if (Py_UNICODE_ISLOWER(ch)) {
5723 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005724 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725 previous_is_cased = 1;
5726 cased = 1;
5727 }
5728 else
5729 previous_is_cased = 0;
5730 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005731 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732}
5733
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005734PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005735"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005737Return True if all characters in S are whitespace\n\
5738and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739
5740static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005741unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742{
5743 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5744 register const Py_UNICODE *e;
5745
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746 /* Shortcut for single character strings */
5747 if (PyUnicode_GET_SIZE(self) == 1 &&
5748 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005749 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005751 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005752 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005753 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005754
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755 e = p + PyUnicode_GET_SIZE(self);
5756 for (; p < e; p++) {
5757 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005758 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005760 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761}
5762
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005763PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005764"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005765\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005766Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005767and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005768
5769static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005770unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005771{
5772 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5773 register const Py_UNICODE *e;
5774
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005775 /* Shortcut for single character strings */
5776 if (PyUnicode_GET_SIZE(self) == 1 &&
5777 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005778 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005779
5780 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005781 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005782 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005783
5784 e = p + PyUnicode_GET_SIZE(self);
5785 for (; p < e; p++) {
5786 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005787 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005788 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005789 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005790}
5791
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005792PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005793"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005794\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005795Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005796and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005797
5798static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005799unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005800{
5801 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5802 register const Py_UNICODE *e;
5803
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005804 /* Shortcut for single character strings */
5805 if (PyUnicode_GET_SIZE(self) == 1 &&
5806 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005807 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005808
5809 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005810 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005811 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005812
5813 e = p + PyUnicode_GET_SIZE(self);
5814 for (; p < e; p++) {
5815 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005816 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005817 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005818 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005819}
5820
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005821PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005822"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005824Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005825False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005826
5827static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005828unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829{
5830 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5831 register const Py_UNICODE *e;
5832
Guido van Rossumd57fd912000-03-10 22:53:23 +00005833 /* Shortcut for single character strings */
5834 if (PyUnicode_GET_SIZE(self) == 1 &&
5835 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005836 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005838 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005839 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005840 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005841
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842 e = p + PyUnicode_GET_SIZE(self);
5843 for (; p < e; p++) {
5844 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005845 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005846 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005847 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848}
5849
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005850PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005851"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005852\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005853Return True if all characters in S are digits\n\
5854and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855
5856static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005857unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858{
5859 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5860 register const Py_UNICODE *e;
5861
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862 /* Shortcut for single character strings */
5863 if (PyUnicode_GET_SIZE(self) == 1 &&
5864 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005865 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005867 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005868 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005869 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005870
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871 e = p + PyUnicode_GET_SIZE(self);
5872 for (; p < e; p++) {
5873 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005874 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005876 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877}
5878
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005879PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005880"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005882Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005883False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884
5885static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005886unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887{
5888 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5889 register const Py_UNICODE *e;
5890
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891 /* Shortcut for single character strings */
5892 if (PyUnicode_GET_SIZE(self) == 1 &&
5893 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005894 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005896 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005897 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005898 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005899
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900 e = p + PyUnicode_GET_SIZE(self);
5901 for (; p < e; p++) {
5902 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005903 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005905 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906}
5907
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005908PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909"S.join(sequence) -> unicode\n\
5910\n\
5911Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005912sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913
5914static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005915unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005917 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918}
5919
Martin v. Löwis18e16552006-02-15 17:27:45 +00005920static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921unicode_length(PyUnicodeObject *self)
5922{
5923 return self->length;
5924}
5925
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005926PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005927"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928\n\
5929Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005930done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931
5932static PyObject *
5933unicode_ljust(PyUnicodeObject *self, PyObject *args)
5934{
Martin v. Löwis412fb672006-04-13 06:34:32 +00005935 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005936 Py_UNICODE fillchar = ' ';
5937
Martin v. Löwis412fb672006-04-13 06:34:32 +00005938 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 return NULL;
5940
Tim Peters7a29bd52001-09-12 03:03:31 +00005941 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942 Py_INCREF(self);
5943 return (PyObject*) self;
5944 }
5945
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005946 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947}
5948
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005949PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950"S.lower() -> unicode\n\
5951\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005952Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953
5954static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005955unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957 return fixup(self, fixlower);
5958}
5959
/* Selector values telling the strip helpers which end(s) to trim */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for error messages: the format string with its leading
   three characters ("|O:") skipped */
#define STRIPNAME(i) (stripformat[i]+3)
5968
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005969/* externally visible for str.strip(unicode) */
5970PyObject *
5971_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5972{
5973 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005974 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005975 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005976 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
5977 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005978
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005979 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
5980
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005981 i = 0;
5982 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005983 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
5984 i++;
5985 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005986 }
5987
5988 j = len;
5989 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005990 do {
5991 j--;
5992 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
5993 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005994 }
5995
5996 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005997 Py_INCREF(self);
5998 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005999 }
6000 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006001 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006002}
6003
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004
6005static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006006do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006008 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006009 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006010
6011 i = 0;
6012 if (striptype != RIGHTSTRIP) {
6013 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6014 i++;
6015 }
6016 }
6017
6018 j = len;
6019 if (striptype != LEFTSTRIP) {
6020 do {
6021 j--;
6022 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6023 j++;
6024 }
6025
6026 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6027 Py_INCREF(self);
6028 return (PyObject*)self;
6029 }
6030 else
6031 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032}
6033
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006034
6035static PyObject *
6036do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6037{
6038 PyObject *sep = NULL;
6039
6040 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6041 return NULL;
6042
6043 if (sep != NULL && sep != Py_None) {
6044 if (PyUnicode_Check(sep))
6045 return _PyUnicode_XStrip(self, striptype, sep);
6046 else if (PyString_Check(sep)) {
6047 PyObject *res;
6048 sep = PyUnicode_FromObject(sep);
6049 if (sep==NULL)
6050 return NULL;
6051 res = _PyUnicode_XStrip(self, striptype, sep);
6052 Py_DECREF(sep);
6053 return res;
6054 }
6055 else {
6056 PyErr_Format(PyExc_TypeError,
6057 "%s arg must be None, unicode or str",
6058 STRIPNAME(striptype));
6059 return NULL;
6060 }
6061 }
6062
6063 return do_strip(self, striptype);
6064}
6065
6066
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006067PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006068"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006069\n\
6070Return a copy of the string S with leading and trailing\n\
6071whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006072If chars is given and not None, remove characters in chars instead.\n\
6073If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006074
6075static PyObject *
6076unicode_strip(PyUnicodeObject *self, PyObject *args)
6077{
6078 if (PyTuple_GET_SIZE(args) == 0)
6079 return do_strip(self, BOTHSTRIP); /* Common case */
6080 else
6081 return do_argstrip(self, BOTHSTRIP, args);
6082}
6083
6084
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006085PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006086"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006087\n\
6088Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006089If chars is given and not None, remove characters in chars instead.\n\
6090If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006091
6092static PyObject *
6093unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6094{
6095 if (PyTuple_GET_SIZE(args) == 0)
6096 return do_strip(self, LEFTSTRIP); /* Common case */
6097 else
6098 return do_argstrip(self, LEFTSTRIP, args);
6099}
6100
6101
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006102PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006103"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006104\n\
6105Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006106If chars is given and not None, remove characters in chars instead.\n\
6107If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006108
6109static PyObject *
6110unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6111{
6112 if (PyTuple_GET_SIZE(args) == 0)
6113 return do_strip(self, RIGHTSTRIP); /* Common case */
6114 else
6115 return do_argstrip(self, RIGHTSTRIP, args);
6116}
6117
6118
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006120unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121{
6122 PyUnicodeObject *u;
6123 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006124 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006125 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126
6127 if (len < 0)
6128 len = 0;
6129
Tim Peters7a29bd52001-09-12 03:03:31 +00006130 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131 /* no repeat, return original string */
6132 Py_INCREF(str);
6133 return (PyObject*) str;
6134 }
Tim Peters8f422462000-09-09 06:13:41 +00006135
6136 /* ensure # of chars needed doesn't overflow int and # of bytes
6137 * needed doesn't overflow size_t
6138 */
6139 nchars = len * str->length;
6140 if (len && nchars / len != str->length) {
6141 PyErr_SetString(PyExc_OverflowError,
6142 "repeated string is too long");
6143 return NULL;
6144 }
6145 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6146 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6147 PyErr_SetString(PyExc_OverflowError,
6148 "repeated string is too long");
6149 return NULL;
6150 }
6151 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152 if (!u)
6153 return NULL;
6154
6155 p = u->str;
6156
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006157 if (str->length == 1 && len > 0) {
6158 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006159 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00006160 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006161 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006162 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006163 done = str->length;
6164 }
6165 while (done < nchars) {
6166 int n = (done <= nchars-done) ? done : nchars-done;
6167 Py_UNICODE_COPY(p+done, p, n);
6168 done += n;
6169 }
6170 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171
6172 return (PyObject*) u;
6173}
6174
6175PyObject *PyUnicode_Replace(PyObject *obj,
6176 PyObject *subobj,
6177 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006178 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179{
6180 PyObject *self;
6181 PyObject *str1;
6182 PyObject *str2;
6183 PyObject *result;
6184
6185 self = PyUnicode_FromObject(obj);
6186 if (self == NULL)
6187 return NULL;
6188 str1 = PyUnicode_FromObject(subobj);
6189 if (str1 == NULL) {
6190 Py_DECREF(self);
6191 return NULL;
6192 }
6193 str2 = PyUnicode_FromObject(replobj);
6194 if (str2 == NULL) {
6195 Py_DECREF(self);
6196 Py_DECREF(str1);
6197 return NULL;
6198 }
Tim Petersced69f82003-09-16 20:30:58 +00006199 result = replace((PyUnicodeObject *)self,
6200 (PyUnicodeObject *)str1,
6201 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202 maxcount);
6203 Py_DECREF(self);
6204 Py_DECREF(str1);
6205 Py_DECREF(str2);
6206 return result;
6207}
6208
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006209PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210"S.replace (old, new[, maxsplit]) -> unicode\n\
6211\n\
6212Return a copy of S with all occurrences of substring\n\
6213old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006214given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215
6216static PyObject*
6217unicode_replace(PyUnicodeObject *self, PyObject *args)
6218{
6219 PyUnicodeObject *str1;
6220 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006221 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222 PyObject *result;
6223
Martin v. Löwis18e16552006-02-15 17:27:45 +00006224 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225 return NULL;
6226 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6227 if (str1 == NULL)
6228 return NULL;
6229 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006230 if (str2 == NULL) {
6231 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006233 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234
6235 result = replace(self, str1, str2, maxcount);
6236
6237 Py_DECREF(str1);
6238 Py_DECREF(str2);
6239 return result;
6240}
6241
6242static
6243PyObject *unicode_repr(PyObject *unicode)
6244{
6245 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6246 PyUnicode_GET_SIZE(unicode),
6247 1);
6248}
6249
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006250PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251"S.rfind(sub [,start [,end]]) -> int\n\
6252\n\
6253Return the highest index in S where substring sub is found,\n\
6254such that sub is contained within s[start,end]. Optional\n\
6255arguments start and end are interpreted as in slice notation.\n\
6256\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006257Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006258
6259static PyObject *
6260unicode_rfind(PyUnicodeObject *self, PyObject *args)
6261{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006262 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006263 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006264 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006265 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006266
Guido van Rossumb8872e62000-05-09 14:14:27 +00006267 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6268 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006270 substring = PyUnicode_FromObject(substring);
6271 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272 return NULL;
6273
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006274 result = stringlib_rfind_slice(
6275 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6276 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6277 start, end
6278 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279
6280 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006281
6282 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283}
6284
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006285PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286"S.rindex(sub [,start [,end]]) -> int\n\
6287\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006288Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289
6290static PyObject *
6291unicode_rindex(PyUnicodeObject *self, PyObject *args)
6292{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006293 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006294 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006295 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006296 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297
Guido van Rossumb8872e62000-05-09 14:14:27 +00006298 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6299 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006301 substring = PyUnicode_FromObject(substring);
6302 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303 return NULL;
6304
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006305 result = stringlib_rfind_slice(
6306 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6307 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6308 start, end
6309 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006310
6311 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006312
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313 if (result < 0) {
6314 PyErr_SetString(PyExc_ValueError, "substring not found");
6315 return NULL;
6316 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006317 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318}
6319
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006320PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006321"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322\n\
6323Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006324done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006325
6326static PyObject *
6327unicode_rjust(PyUnicodeObject *self, PyObject *args)
6328{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006329 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006330 Py_UNICODE fillchar = ' ';
6331
Martin v. Löwis412fb672006-04-13 06:34:32 +00006332 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333 return NULL;
6334
Tim Peters7a29bd52001-09-12 03:03:31 +00006335 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336 Py_INCREF(self);
6337 return (PyObject*) self;
6338 }
6339
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006340 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341}
6342
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006344unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345{
6346 /* standard clamping */
6347 if (start < 0)
6348 start = 0;
6349 if (end < 0)
6350 end = 0;
6351 if (end > self->length)
6352 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006353 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354 /* full slice, return original string */
6355 Py_INCREF(self);
6356 return (PyObject*) self;
6357 }
6358 if (start > end)
6359 start = end;
6360 /* copy slice */
6361 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6362 end - start);
6363}
6364
6365PyObject *PyUnicode_Split(PyObject *s,
6366 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006367 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368{
6369 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006370
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371 s = PyUnicode_FromObject(s);
6372 if (s == NULL)
6373 return NULL;
6374 if (sep != NULL) {
6375 sep = PyUnicode_FromObject(sep);
6376 if (sep == NULL) {
6377 Py_DECREF(s);
6378 return NULL;
6379 }
6380 }
6381
6382 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6383
6384 Py_DECREF(s);
6385 Py_XDECREF(sep);
6386 return result;
6387}
6388
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006389PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390"S.split([sep [,maxsplit]]) -> list of strings\n\
6391\n\
6392Return a list of the words in S, using sep as the\n\
6393delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006394splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006395any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006396
6397static PyObject*
6398unicode_split(PyUnicodeObject *self, PyObject *args)
6399{
6400 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006401 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402
Martin v. Löwis18e16552006-02-15 17:27:45 +00006403 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404 return NULL;
6405
6406 if (substring == Py_None)
6407 return split(self, NULL, maxcount);
6408 else if (PyUnicode_Check(substring))
6409 return split(self, (PyUnicodeObject *)substring, maxcount);
6410 else
6411 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6412}
6413
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006414PyObject *
6415PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6416{
6417 PyObject* str_obj;
6418 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006419 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00006420
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006421 str_obj = PyUnicode_FromObject(str_in);
6422 if (!str_obj)
6423 return NULL;
6424 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00006425 if (!sep_obj) {
6426 Py_DECREF(str_obj);
6427 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006428 }
6429
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006430 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00006431 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6432 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6433 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006434
Fredrik Lundhb9479482006-05-26 17:22:38 +00006435 Py_DECREF(sep_obj);
6436 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006437
6438 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006439}
6440
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006441
6442PyObject *
6443PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6444{
6445 PyObject* str_obj;
6446 PyObject* sep_obj;
6447 PyObject* out;
6448
6449 str_obj = PyUnicode_FromObject(str_in);
6450 if (!str_obj)
6451 return NULL;
6452 sep_obj = PyUnicode_FromObject(sep_in);
6453 if (!sep_obj) {
6454 Py_DECREF(str_obj);
6455 return NULL;
6456 }
6457
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006458 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006459 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6460 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6461 );
6462
6463 Py_DECREF(sep_obj);
6464 Py_DECREF(str_obj);
6465
6466 return out;
6467}
6468
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006469PyDoc_STRVAR(partition__doc__,
6470"S.partition(sep) -> (head, sep, tail)\n\
6471\n\
6472Searches for the separator sep in S, and returns the part before it,\n\
6473the separator itself, and the part after it. If the separator is not\n\
6474found, returns S and two empty strings.");
6475
6476static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00006477unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006478{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006479 return PyUnicode_Partition((PyObject *)self, separator);
6480}
6481
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006482PyDoc_STRVAR(rpartition__doc__,
6483"S.rpartition(sep) -> (head, sep, tail)\n\
6484\n\
6485Searches for the separator sep in S, starting at the end of S, and returns\n\
6486the part before it, the separator itself, and the part after it. If the\n\
6487separator is not found, returns S and two empty strings.");
6488
6489static PyObject*
6490unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6491{
6492 return PyUnicode_RPartition((PyObject *)self, separator);
6493}
6494
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006495PyObject *PyUnicode_RSplit(PyObject *s,
6496 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006497 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006498{
6499 PyObject *result;
6500
6501 s = PyUnicode_FromObject(s);
6502 if (s == NULL)
6503 return NULL;
6504 if (sep != NULL) {
6505 sep = PyUnicode_FromObject(sep);
6506 if (sep == NULL) {
6507 Py_DECREF(s);
6508 return NULL;
6509 }
6510 }
6511
6512 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6513
6514 Py_DECREF(s);
6515 Py_XDECREF(sep);
6516 return result;
6517}
6518
6519PyDoc_STRVAR(rsplit__doc__,
6520"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6521\n\
6522Return a list of the words in S, using sep as the\n\
6523delimiter string, starting at the end of the string and\n\
6524working to the front. If maxsplit is given, at most maxsplit\n\
6525splits are done. If sep is not specified, any whitespace string\n\
6526is a separator.");
6527
6528static PyObject*
6529unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6530{
6531 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006532 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006533
Martin v. Löwis18e16552006-02-15 17:27:45 +00006534 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006535 return NULL;
6536
6537 if (substring == Py_None)
6538 return rsplit(self, NULL, maxcount);
6539 else if (PyUnicode_Check(substring))
6540 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6541 else
6542 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6543}
6544
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006545PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006546"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547\n\
6548Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006549Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006550is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551
6552static PyObject*
6553unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6554{
Guido van Rossum86662912000-04-11 15:38:46 +00006555 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006556
Guido van Rossum86662912000-04-11 15:38:46 +00006557 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558 return NULL;
6559
Guido van Rossum86662912000-04-11 15:38:46 +00006560 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561}
6562
6563static
6564PyObject *unicode_str(PyUnicodeObject *self)
6565{
Fred Drakee4315f52000-05-09 19:53:39 +00006566 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567}
6568
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006569PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570"S.swapcase() -> unicode\n\
6571\n\
6572Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006573and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574
6575static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006576unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578 return fixup(self, fixswapcase);
6579}
6580
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006581PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582"S.translate(table) -> unicode\n\
6583\n\
6584Return a copy of the string S, where all characters have been mapped\n\
6585through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006586Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6587Unmapped characters are left untouched. Characters mapped to None\n\
6588are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589
6590static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006591unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592{
Tim Petersced69f82003-09-16 20:30:58 +00006593 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006595 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596 "ignore");
6597}
6598
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006599PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600"S.upper() -> unicode\n\
6601\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006602Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603
6604static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006605unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607 return fixup(self, fixupper);
6608}
6609
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006610PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611"S.zfill(width) -> unicode\n\
6612\n\
6613Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006614of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615
6616static PyObject *
6617unicode_zfill(PyUnicodeObject *self, PyObject *args)
6618{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006619 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620 PyUnicodeObject *u;
6621
Martin v. Löwis18e16552006-02-15 17:27:45 +00006622 Py_ssize_t width;
6623 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624 return NULL;
6625
6626 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006627 if (PyUnicode_CheckExact(self)) {
6628 Py_INCREF(self);
6629 return (PyObject*) self;
6630 }
6631 else
6632 return PyUnicode_FromUnicode(
6633 PyUnicode_AS_UNICODE(self),
6634 PyUnicode_GET_SIZE(self)
6635 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636 }
6637
6638 fill = width - self->length;
6639
6640 u = pad(self, fill, 0, '0');
6641
Walter Dörwald068325e2002-04-15 13:36:47 +00006642 if (u == NULL)
6643 return NULL;
6644
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645 if (u->str[fill] == '+' || u->str[fill] == '-') {
6646 /* move sign to beginning of string */
6647 u->str[0] = u->str[fill];
6648 u->str[fill] = '0';
6649 }
6650
6651 return (PyObject*) u;
6652}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653
#if 0
/* Compiled out: returns the value of unicode_freelist_size as a Python
   int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
6661
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006662PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006663"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006665Return True if S starts with the specified prefix, False otherwise.\n\
6666With optional start, test S beginning at that position.\n\
6667With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668
6669static PyObject *
6670unicode_startswith(PyUnicodeObject *self,
6671 PyObject *args)
6672{
6673 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006674 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006675 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676 PyObject *result;
6677
Guido van Rossumb8872e62000-05-09 14:14:27 +00006678 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6679 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680 return NULL;
6681 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6682 (PyObject *)substring);
6683 if (substring == NULL)
6684 return NULL;
6685
Guido van Rossum77f6a652002-04-03 22:41:51 +00006686 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687
6688 Py_DECREF(substring);
6689 return result;
6690}
6691
6692
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006693PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006694"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006696Return True if S ends with the specified suffix, False otherwise.\n\
6697With optional start, test S beginning at that position.\n\
6698With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699
6700static PyObject *
6701unicode_endswith(PyUnicodeObject *self,
6702 PyObject *args)
6703{
6704 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006705 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006706 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006707 PyObject *result;
6708
Guido van Rossumb8872e62000-05-09 14:14:27 +00006709 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6710 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711 return NULL;
6712 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6713 (PyObject *)substring);
6714 if (substring == NULL)
6715 return NULL;
6716
Guido van Rossum77f6a652002-04-03 22:41:51 +00006717 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718
6719 Py_DECREF(substring);
6720 return result;
6721}
6722
6723
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006724
6725static PyObject *
6726unicode_getnewargs(PyUnicodeObject *v)
6727{
6728 return Py_BuildValue("(u#)", v->str, v->length);
6729}
6730
6731
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732static PyMethodDef unicode_methods[] = {
6733
6734 /* Order is according to common usage: often used methods should
6735 appear first, since lookup is done sequentially. */
6736
Georg Brandlecdc0a92006-03-30 12:19:07 +00006737 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006738 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6739 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006740 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006741 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6742 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6743 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6744 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6745 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6746 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6747 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00006748 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006749 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6750 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6751 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006752 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006753 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006754/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6755 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6756 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6757 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006758 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006759 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006760 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006761 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006762 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6763 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6764 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6765 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6766 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6767 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6768 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6769 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6770 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6771 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6772 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6773 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6774 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6775 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006776 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006777#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006778 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779#endif
6780
6781#if 0
6782 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006783 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784#endif
6785
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006786 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787 {NULL, NULL}
6788};
6789
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006790static PyObject *
6791unicode_mod(PyObject *v, PyObject *w)
6792{
6793 if (!PyUnicode_Check(v)) {
6794 Py_INCREF(Py_NotImplemented);
6795 return Py_NotImplemented;
6796 }
6797 return PyUnicode_Format(v, w);
6798}
6799
6800static PyNumberMethods unicode_as_number = {
6801 0, /*nb_add*/
6802 0, /*nb_subtract*/
6803 0, /*nb_multiply*/
6804 0, /*nb_divide*/
6805 unicode_mod, /*nb_remainder*/
6806};
6807
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006809 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00006810 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006811 (ssizeargfunc) unicode_repeat, /* sq_repeat */
6812 (ssizeargfunc) unicode_getitem, /* sq_item */
6813 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814 0, /* sq_ass_item */
6815 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00006816 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817};
6818
Guido van Rossum38fff8c2006-03-07 18:50:55 +00006819#define HASINDEX(o) PyType_HasFeature((o)->ob_type, Py_TPFLAGS_HAVE_INDEX)
6820
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006821static PyObject*
6822unicode_subscript(PyUnicodeObject* self, PyObject* item)
6823{
Guido van Rossum38fff8c2006-03-07 18:50:55 +00006824 PyNumberMethods *nb = item->ob_type->tp_as_number;
6825 if (nb != NULL && HASINDEX(item) && nb->nb_index != NULL) {
6826 Py_ssize_t i = nb->nb_index(item);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006827 if (i == -1 && PyErr_Occurred())
6828 return NULL;
6829 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006830 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006831 return unicode_getitem(self, i);
6832 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006833 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006834 Py_UNICODE* source_buf;
6835 Py_UNICODE* result_buf;
6836 PyObject* result;
6837
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006838 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006839 &start, &stop, &step, &slicelength) < 0) {
6840 return NULL;
6841 }
6842
6843 if (slicelength <= 0) {
6844 return PyUnicode_FromUnicode(NULL, 0);
6845 } else {
6846 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Anthony Baxtera6286212006-04-11 07:42:36 +00006847 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
6848 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006849
6850 if (result_buf == NULL)
6851 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006852
6853 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6854 result_buf[i] = source_buf[cur];
6855 }
Tim Petersced69f82003-09-16 20:30:58 +00006856
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006857 result = PyUnicode_FromUnicode(result_buf, slicelength);
6858 PyMem_FREE(result_buf);
6859 return result;
6860 }
6861 } else {
6862 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6863 return NULL;
6864 }
6865}
6866
6867static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006868 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006869 (binaryfunc)unicode_subscript, /* mp_subscript */
6870 (objobjargproc)0, /* mp_ass_subscript */
6871};
6872
Martin v. Löwis18e16552006-02-15 17:27:45 +00006873static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006875 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876 const void **ptr)
6877{
6878 if (index != 0) {
6879 PyErr_SetString(PyExc_SystemError,
6880 "accessing non-existent unicode segment");
6881 return -1;
6882 }
6883 *ptr = (void *) self->str;
6884 return PyUnicode_GET_DATA_SIZE(self);
6885}
6886
Martin v. Löwis18e16552006-02-15 17:27:45 +00006887static Py_ssize_t
6888unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889 const void **ptr)
6890{
6891 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006892 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893 return -1;
6894}
6895
6896static int
6897unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006898 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899{
6900 if (lenp)
6901 *lenp = PyUnicode_GET_DATA_SIZE(self);
6902 return 1;
6903}
6904
Martin v. Löwiseb079f12006-02-16 14:32:27 +00006905static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006907 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908 const void **ptr)
6909{
6910 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006911
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912 if (index != 0) {
6913 PyErr_SetString(PyExc_SystemError,
6914 "accessing non-existent unicode segment");
6915 return -1;
6916 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006917 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918 if (str == NULL)
6919 return -1;
6920 *ptr = (void *) PyString_AS_STRING(str);
6921 return PyString_GET_SIZE(str);
6922}
6923
6924/* Helpers for PyUnicode_Format() */
6925
6926static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006927getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006929 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930 if (argidx < arglen) {
6931 (*p_argidx)++;
6932 if (arglen < 0)
6933 return args;
6934 else
6935 return PyTuple_GetItem(args, argidx);
6936 }
6937 PyErr_SetString(PyExc_TypeError,
6938 "not enough arguments for format string");
6939 return NULL;
6940}
6941
/* Bit flags describing the modifiers of one format specifier. */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
6947
Martin v. Löwis18e16552006-02-15 17:27:45 +00006948static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00006949strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006951 register Py_ssize_t i;
6952 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953 for (i = len - 1; i >= 0; i--)
6954 buffer[i] = (Py_UNICODE) charbuffer[i];
6955
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956 return len;
6957}
6958
Neal Norwitzfc76d632006-01-10 06:03:13 +00006959static int
6960doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
6961{
Tim Peters15231542006-02-16 01:08:01 +00006962 Py_ssize_t result;
6963
Neal Norwitzfc76d632006-01-10 06:03:13 +00006964 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006965 result = strtounicode(buffer, (char *)buffer);
6966 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006967}
6968
6969static int
6970longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
6971{
Tim Peters15231542006-02-16 01:08:01 +00006972 Py_ssize_t result;
6973
Neal Norwitzfc76d632006-01-10 06:03:13 +00006974 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006975 result = strtounicode(buffer, (char *)buffer);
6976 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006977}
6978
Guido van Rossum078151d2002-08-11 04:24:12 +00006979/* XXX To save some code duplication, formatfloat/long/int could have been
6980 shared with stringobject.c, converting from 8-bit to Unicode after the
6981 formatting is done. */
6982
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983static int
6984formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006985 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986 int flags,
6987 int prec,
6988 int type,
6989 PyObject *v)
6990{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006991 /* fmt = '%#.' + `prec` + `type`
6992 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993 char fmt[20];
6994 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006995
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996 x = PyFloat_AsDouble(v);
6997 if (x == -1.0 && PyErr_Occurred())
6998 return -1;
6999 if (prec < 0)
7000 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7002 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007003 /* Worst case length calc to ensure no buffer overrun:
7004
7005 'g' formats:
7006 fmt = %#.<prec>g
7007 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7008 for any double rep.)
7009 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7010
7011 'f' formats:
7012 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7013 len = 1 + 50 + 1 + prec = 52 + prec
7014
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007015 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007016 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007017
7018 */
7019 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7020 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007021 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007022 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007023 return -1;
7024 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007025 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7026 (flags&F_ALT) ? "#" : "",
7027 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007028 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007029}
7030
Tim Peters38fd5b62000-09-21 05:43:11 +00007031static PyObject*
7032formatlong(PyObject *val, int flags, int prec, int type)
7033{
7034 char *buf;
7035 int i, len;
7036 PyObject *str; /* temporary string object. */
7037 PyUnicodeObject *result;
7038
7039 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7040 if (!str)
7041 return NULL;
7042 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007043 if (!result) {
7044 Py_DECREF(str);
7045 return NULL;
7046 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007047 for (i = 0; i < len; i++)
7048 result->str[i] = buf[i];
7049 result->str[len] = 0;
7050 Py_DECREF(str);
7051 return (PyObject*)result;
7052}
7053
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054static int
7055formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007056 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007057 int flags,
7058 int prec,
7059 int type,
7060 PyObject *v)
7061{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007062 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007063 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7064 * + 1 + 1
7065 * = 24
7066 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007067 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007068 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007069 long x;
7070
7071 x = PyInt_AsLong(v);
7072 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007073 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007074 if (x < 0 && type == 'u') {
7075 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007076 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007077 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7078 sign = "-";
7079 else
7080 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007082 prec = 1;
7083
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007084 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7085 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007086 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007087 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007088 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007089 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007090 return -1;
7091 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007092
7093 if ((flags & F_ALT) &&
7094 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007095 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007096 * of issues that cause pain:
7097 * - when 0 is being converted, the C standard leaves off
7098 * the '0x' or '0X', which is inconsistent with other
7099 * %#x/%#X conversions and inconsistent with Python's
7100 * hex() function
7101 * - there are platforms that violate the standard and
7102 * convert 0 with the '0x' or '0X'
7103 * (Metrowerks, Compaq Tru64)
7104 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007105 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007106 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007107 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007108 * We can achieve the desired consistency by inserting our
7109 * own '0x' or '0X' prefix, and substituting %x/%X in place
7110 * of %#x/%#X.
7111 *
7112 * Note that this is the same approach as used in
7113 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007114 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007115 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7116 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007117 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007118 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007119 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7120 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007121 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007122 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007123 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007124 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007125 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007126 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007127}
7128
7129static int
7130formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007131 size_t buflen,
7132 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007133{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007134 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007135 if (PyUnicode_Check(v)) {
7136 if (PyUnicode_GET_SIZE(v) != 1)
7137 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007139 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007140
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007141 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007142 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007143 goto onError;
7144 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7145 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146
7147 else {
7148 /* Integer input truncated to a character */
7149 long x;
7150 x = PyInt_AsLong(v);
7151 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007152 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007153#ifdef Py_UNICODE_WIDE
7154 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007155 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007156 "%c arg not in range(0x110000) "
7157 "(wide Python build)");
7158 return -1;
7159 }
7160#else
7161 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007162 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007163 "%c arg not in range(0x10000) "
7164 "(narrow Python build)");
7165 return -1;
7166 }
7167#endif
7168 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007169 }
7170 buf[1] = '\0';
7171 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007172
7173 onError:
7174 PyErr_SetString(PyExc_TypeError,
7175 "%c requires int or char");
7176 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007177}
7178
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007179/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
7180
7181 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
7182 chars are formatted. XXX This is a magic number. Each formatting
7183 routine does bounds checking to ensure no overflow, but a better
7184 solution may be to malloc a buffer of appropriate size for each
7185 format. For now, the current solution is sufficient.
7186*/
7187#define FORMATBUFLEN (size_t)120
7188
Guido van Rossumd57fd912000-03-10 22:53:23 +00007189PyObject *PyUnicode_Format(PyObject *format,
7190 PyObject *args)
7191{
7192 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007193 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007194 int args_owned = 0;
7195 PyUnicodeObject *result = NULL;
7196 PyObject *dict = NULL;
7197 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007198
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199 if (format == NULL || args == NULL) {
7200 PyErr_BadInternalCall();
7201 return NULL;
7202 }
7203 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007204 if (uformat == NULL)
7205 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206 fmt = PyUnicode_AS_UNICODE(uformat);
7207 fmtcnt = PyUnicode_GET_SIZE(uformat);
7208
7209 reslen = rescnt = fmtcnt + 100;
7210 result = _PyUnicode_New(reslen);
7211 if (result == NULL)
7212 goto onError;
7213 res = PyUnicode_AS_UNICODE(result);
7214
7215 if (PyTuple_Check(args)) {
7216 arglen = PyTuple_Size(args);
7217 argidx = 0;
7218 }
7219 else {
7220 arglen = -1;
7221 argidx = -2;
7222 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007223 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7224 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007225 dict = args;
7226
7227 while (--fmtcnt >= 0) {
7228 if (*fmt != '%') {
7229 if (--rescnt < 0) {
7230 rescnt = fmtcnt + 100;
7231 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007232 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007233 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007234 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7235 --rescnt;
7236 }
7237 *res++ = *fmt++;
7238 }
7239 else {
7240 /* Got a format specifier */
7241 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007242 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007244 Py_UNICODE c = '\0';
7245 Py_UNICODE fill;
7246 PyObject *v = NULL;
7247 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007248 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007249 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007250 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007251 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007252
7253 fmt++;
7254 if (*fmt == '(') {
7255 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007256 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007257 PyObject *key;
7258 int pcount = 1;
7259
7260 if (dict == NULL) {
7261 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007262 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263 goto onError;
7264 }
7265 ++fmt;
7266 --fmtcnt;
7267 keystart = fmt;
7268 /* Skip over balanced parentheses */
7269 while (pcount > 0 && --fmtcnt >= 0) {
7270 if (*fmt == ')')
7271 --pcount;
7272 else if (*fmt == '(')
7273 ++pcount;
7274 fmt++;
7275 }
7276 keylen = fmt - keystart - 1;
7277 if (fmtcnt < 0 || pcount > 0) {
7278 PyErr_SetString(PyExc_ValueError,
7279 "incomplete format key");
7280 goto onError;
7281 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007282#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007283 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284 then looked up since Python uses strings to hold
7285 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007286 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007287 key = PyUnicode_EncodeUTF8(keystart,
7288 keylen,
7289 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007290#else
7291 key = PyUnicode_FromUnicode(keystart, keylen);
7292#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007293 if (key == NULL)
7294 goto onError;
7295 if (args_owned) {
7296 Py_DECREF(args);
7297 args_owned = 0;
7298 }
7299 args = PyObject_GetItem(dict, key);
7300 Py_DECREF(key);
7301 if (args == NULL) {
7302 goto onError;
7303 }
7304 args_owned = 1;
7305 arglen = -1;
7306 argidx = -2;
7307 }
7308 while (--fmtcnt >= 0) {
7309 switch (c = *fmt++) {
7310 case '-': flags |= F_LJUST; continue;
7311 case '+': flags |= F_SIGN; continue;
7312 case ' ': flags |= F_BLANK; continue;
7313 case '#': flags |= F_ALT; continue;
7314 case '0': flags |= F_ZERO; continue;
7315 }
7316 break;
7317 }
7318 if (c == '*') {
7319 v = getnextarg(args, arglen, &argidx);
7320 if (v == NULL)
7321 goto onError;
7322 if (!PyInt_Check(v)) {
7323 PyErr_SetString(PyExc_TypeError,
7324 "* wants int");
7325 goto onError;
7326 }
7327 width = PyInt_AsLong(v);
7328 if (width < 0) {
7329 flags |= F_LJUST;
7330 width = -width;
7331 }
7332 if (--fmtcnt >= 0)
7333 c = *fmt++;
7334 }
7335 else if (c >= '0' && c <= '9') {
7336 width = c - '0';
7337 while (--fmtcnt >= 0) {
7338 c = *fmt++;
7339 if (c < '0' || c > '9')
7340 break;
7341 if ((width*10) / 10 != width) {
7342 PyErr_SetString(PyExc_ValueError,
7343 "width too big");
7344 goto onError;
7345 }
7346 width = width*10 + (c - '0');
7347 }
7348 }
7349 if (c == '.') {
7350 prec = 0;
7351 if (--fmtcnt >= 0)
7352 c = *fmt++;
7353 if (c == '*') {
7354 v = getnextarg(args, arglen, &argidx);
7355 if (v == NULL)
7356 goto onError;
7357 if (!PyInt_Check(v)) {
7358 PyErr_SetString(PyExc_TypeError,
7359 "* wants int");
7360 goto onError;
7361 }
7362 prec = PyInt_AsLong(v);
7363 if (prec < 0)
7364 prec = 0;
7365 if (--fmtcnt >= 0)
7366 c = *fmt++;
7367 }
7368 else if (c >= '0' && c <= '9') {
7369 prec = c - '0';
7370 while (--fmtcnt >= 0) {
7371 c = Py_CHARMASK(*fmt++);
7372 if (c < '0' || c > '9')
7373 break;
7374 if ((prec*10) / 10 != prec) {
7375 PyErr_SetString(PyExc_ValueError,
7376 "prec too big");
7377 goto onError;
7378 }
7379 prec = prec*10 + (c - '0');
7380 }
7381 }
7382 } /* prec */
7383 if (fmtcnt >= 0) {
7384 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007385 if (--fmtcnt >= 0)
7386 c = *fmt++;
7387 }
7388 }
7389 if (fmtcnt < 0) {
7390 PyErr_SetString(PyExc_ValueError,
7391 "incomplete format");
7392 goto onError;
7393 }
7394 if (c != '%') {
7395 v = getnextarg(args, arglen, &argidx);
7396 if (v == NULL)
7397 goto onError;
7398 }
7399 sign = 0;
7400 fill = ' ';
7401 switch (c) {
7402
7403 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007404 pbuf = formatbuf;
7405 /* presume that buffer length is at least 1 */
7406 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007407 len = 1;
7408 break;
7409
7410 case 's':
7411 case 'r':
7412 if (PyUnicode_Check(v) && c == 's') {
7413 temp = v;
7414 Py_INCREF(temp);
7415 }
7416 else {
7417 PyObject *unicode;
7418 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007419 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420 else
7421 temp = PyObject_Repr(v);
7422 if (temp == NULL)
7423 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007424 if (PyUnicode_Check(temp))
7425 /* nothing to do */;
7426 else if (PyString_Check(temp)) {
7427 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007428 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007430 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007431 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007432 Py_DECREF(temp);
7433 temp = unicode;
7434 if (temp == NULL)
7435 goto onError;
7436 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007437 else {
7438 Py_DECREF(temp);
7439 PyErr_SetString(PyExc_TypeError,
7440 "%s argument has non-string str()");
7441 goto onError;
7442 }
7443 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007444 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445 len = PyUnicode_GET_SIZE(temp);
7446 if (prec >= 0 && len > prec)
7447 len = prec;
7448 break;
7449
7450 case 'i':
7451 case 'd':
7452 case 'u':
7453 case 'o':
7454 case 'x':
7455 case 'X':
7456 if (c == 'i')
7457 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007458 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007459 temp = formatlong(v, flags, prec, c);
7460 if (!temp)
7461 goto onError;
7462 pbuf = PyUnicode_AS_UNICODE(temp);
7463 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007464 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007465 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007466 else {
7467 pbuf = formatbuf;
7468 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7469 flags, prec, c, v);
7470 if (len < 0)
7471 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007472 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007473 }
7474 if (flags & F_ZERO)
7475 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476 break;
7477
7478 case 'e':
7479 case 'E':
7480 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007481 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482 case 'g':
7483 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007484 if (c == 'F')
7485 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007486 pbuf = formatbuf;
7487 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7488 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007489 if (len < 0)
7490 goto onError;
7491 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007492 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007493 fill = '0';
7494 break;
7495
7496 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007497 pbuf = formatbuf;
7498 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007499 if (len < 0)
7500 goto onError;
7501 break;
7502
7503 default:
7504 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007505 "unsupported format character '%c' (0x%x) "
7506 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00007507 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007508 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00007509 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007510 goto onError;
7511 }
7512 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007513 if (*pbuf == '-' || *pbuf == '+') {
7514 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515 len--;
7516 }
7517 else if (flags & F_SIGN)
7518 sign = '+';
7519 else if (flags & F_BLANK)
7520 sign = ' ';
7521 else
7522 sign = 0;
7523 }
7524 if (width < len)
7525 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007526 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007527 reslen -= rescnt;
7528 rescnt = width + fmtcnt + 100;
7529 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007530 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007531 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007532 PyErr_NoMemory();
7533 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007534 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007535 if (_PyUnicode_Resize(&result, reslen) < 0) {
7536 Py_XDECREF(temp);
7537 goto onError;
7538 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007539 res = PyUnicode_AS_UNICODE(result)
7540 + reslen - rescnt;
7541 }
7542 if (sign) {
7543 if (fill != ' ')
7544 *res++ = sign;
7545 rescnt--;
7546 if (width > len)
7547 width--;
7548 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007549 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7550 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007551 assert(pbuf[1] == c);
7552 if (fill != ' ') {
7553 *res++ = *pbuf++;
7554 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007555 }
Tim Petersfff53252001-04-12 18:38:48 +00007556 rescnt -= 2;
7557 width -= 2;
7558 if (width < 0)
7559 width = 0;
7560 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007561 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007562 if (width > len && !(flags & F_LJUST)) {
7563 do {
7564 --rescnt;
7565 *res++ = fill;
7566 } while (--width > len);
7567 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007568 if (fill == ' ') {
7569 if (sign)
7570 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007571 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007572 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007573 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007574 *res++ = *pbuf++;
7575 *res++ = *pbuf++;
7576 }
7577 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007578 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579 res += len;
7580 rescnt -= len;
7581 while (--width >= len) {
7582 --rescnt;
7583 *res++ = ' ';
7584 }
7585 if (dict && (argidx < arglen) && c != '%') {
7586 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007587 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007588 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007589 goto onError;
7590 }
7591 Py_XDECREF(temp);
7592 } /* '%' */
7593 } /* until end */
7594 if (argidx < arglen && !dict) {
7595 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007596 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597 goto onError;
7598 }
7599
Thomas Woutersa96affe2006-03-12 00:29:36 +00007600 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7601 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007602 if (args_owned) {
7603 Py_DECREF(args);
7604 }
7605 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007606 return (PyObject *)result;
7607
7608 onError:
7609 Py_XDECREF(result);
7610 Py_DECREF(uformat);
7611 if (args_owned) {
7612 Py_DECREF(args);
7613 }
7614 return NULL;
7615}
7616
7617static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007618 (readbufferproc) unicode_buffer_getreadbuf,
7619 (writebufferproc) unicode_buffer_getwritebuf,
7620 (segcountproc) unicode_buffer_getsegcount,
7621 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007622};
7623
Jeremy Hylton938ace62002-07-17 16:30:39 +00007624static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007625unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7626
Tim Peters6d6c1a32001-08-02 04:15:00 +00007627static PyObject *
7628unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7629{
7630 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007631 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007632 char *encoding = NULL;
7633 char *errors = NULL;
7634
Guido van Rossume023fe02001-08-30 03:12:59 +00007635 if (type != &PyUnicode_Type)
7636 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007637 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7638 kwlist, &x, &encoding, &errors))
7639 return NULL;
7640 if (x == NULL)
7641 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007642 if (encoding == NULL && errors == NULL)
7643 return PyObject_Unicode(x);
7644 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007645 return PyUnicode_FromEncodedObject(x, encoding, errors);
7646}
7647
Guido van Rossume023fe02001-08-30 03:12:59 +00007648static PyObject *
7649unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7650{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007651 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007652 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007653
7654 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7655 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7656 if (tmp == NULL)
7657 return NULL;
7658 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007659 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007660 if (pnew == NULL) {
7661 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007662 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007663 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007664 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7665 if (pnew->str == NULL) {
7666 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007667 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007668 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007669 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007670 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007671 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7672 pnew->length = n;
7673 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007674 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007675 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007676}
7677
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007678PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007679"unicode(string [, encoding[, errors]]) -> object\n\
7680\n\
7681Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007682encoding defaults to the current default string encoding.\n\
7683errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007684
Guido van Rossumd57fd912000-03-10 22:53:23 +00007685PyTypeObject PyUnicode_Type = {
7686 PyObject_HEAD_INIT(&PyType_Type)
7687 0, /* ob_size */
7688 "unicode", /* tp_name */
7689 sizeof(PyUnicodeObject), /* tp_size */
7690 0, /* tp_itemsize */
7691 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007692 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007694 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007695 0, /* tp_setattr */
7696 (cmpfunc) unicode_compare, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00007697 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007698 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007699 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007700 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701 (hashfunc) unicode_hash, /* tp_hash*/
7702 0, /* tp_call*/
7703 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007704 PyObject_GenericGetAttr, /* tp_getattro */
7705 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007706 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007707 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7708 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007709 unicode_doc, /* tp_doc */
7710 0, /* tp_traverse */
7711 0, /* tp_clear */
7712 0, /* tp_richcompare */
7713 0, /* tp_weaklistoffset */
7714 0, /* tp_iter */
7715 0, /* tp_iternext */
7716 unicode_methods, /* tp_methods */
7717 0, /* tp_members */
7718 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007719 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007720 0, /* tp_dict */
7721 0, /* tp_descr_get */
7722 0, /* tp_descr_set */
7723 0, /* tp_dictoffset */
7724 0, /* tp_init */
7725 0, /* tp_alloc */
7726 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007727 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728};
7729
7730/* Initialize the Unicode implementation */
7731
Thomas Wouters78890102000-07-22 19:25:51 +00007732void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007734 int i;
7735
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007736 /* XXX - move this array to unicodectype.c ? */
7737 Py_UNICODE linebreak[] = {
7738 0x000A, /* LINE FEED */
7739 0x000D, /* CARRIAGE RETURN */
7740 0x001C, /* FILE SEPARATOR */
7741 0x001D, /* GROUP SEPARATOR */
7742 0x001E, /* RECORD SEPARATOR */
7743 0x0085, /* NEXT LINE */
7744 0x2028, /* LINE SEPARATOR */
7745 0x2029, /* PARAGRAPH SEPARATOR */
7746 };
7747
Fred Drakee4315f52000-05-09 19:53:39 +00007748 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007749 unicode_freelist = NULL;
7750 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007752 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007753 for (i = 0; i < 256; i++)
7754 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007755 if (PyType_Ready(&PyUnicode_Type) < 0)
7756 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007757
7758 /* initialize the linebreak bloom filter */
7759 bloom_linebreak = make_bloom_mask(
7760 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
7761 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007762}
7763
7764/* Finalize the Unicode implementation */
7765
7766void
Thomas Wouters78890102000-07-22 19:25:51 +00007767_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007769 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007770 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007772 Py_XDECREF(unicode_empty);
7773 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007774
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007775 for (i = 0; i < 256; i++) {
7776 if (unicode_latin1[i]) {
7777 Py_DECREF(unicode_latin1[i]);
7778 unicode_latin1[i] = NULL;
7779 }
7780 }
7781
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007782 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007783 PyUnicodeObject *v = u;
7784 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007785 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007786 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007787 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007788 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007790 unicode_freelist = NULL;
7791 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007793
Anthony Baxterac6bd462006-04-13 02:06:09 +00007794#ifdef __cplusplus
7795}
7796#endif
7797
7798
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007799/*
7800Local variables:
7801c-basic-offset: 4
7802indent-tabs-mode: nil
7803End:
7804*/