blob: 27fedca463a49882ce996d9f7eceef306a4e6c6b [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
*/
103static PyObject *interned;
104
Guido van Rossumd57fd912000-03-10 22:53:23 +0000105/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000106static PyUnicodeObject *unicode_freelist;
107static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000109/* The empty Unicode object is shared to improve performance. */
110static PyUnicodeObject *unicode_empty;
111
112/* Single character Unicode strings in the Latin-1 range are being
113 shared as well. */
114static PyUnicodeObject *unicode_latin1[256];
115
Fred Drakee4315f52000-05-09 19:53:39 +0000116/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000117 parameter; it is fixed to "utf-8". Always use the
118 PyUnicode_GetDefaultEncoding() API to access this global. */
119static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000120
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000121Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000122PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000123{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000124#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000125 return 0x10FFFF;
126#else
127 /* This is actually an illegal character, so it should
128 not be passed to unichr. */
129 return 0xFFFF;
130#endif
131}
132
Thomas Wouters477c8d52006-05-27 19:21:47 +0000133/* --- Bloom Filters ----------------------------------------------------- */
134
/* Support for simple "bloom filters" over Unicode characters.  To keep
   things simple, a single bitmask is used, and the 5 least significant
   bits of each Unicode character select the bit index. */

/* The linebreak mask is set up by _PyUnicode_Init() below. */
140
/* Integer type that holds a bloom mask.  Bit indices are taken from the
   low 5 bits of a character, so only bits 0..31 are ever used. */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is an unsigned long: shifting a plain int 1 left
   by 31 overflows a 32-bit int, which is undefined behavior.  mask is
   parenthesized so that compound expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Non-zero when ch is a line break; the bloom mask is tested first and
   Py_UNICODE_ISLINEBREAK() is only evaluated when the mask bit is set. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
149
150Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
151{
152 /* calculate simple bloom-style bitmask for a given unicode string */
153
154 long mask;
155 Py_ssize_t i;
156
157 mask = 0;
158 for (i = 0; i < len; i++)
159 mask |= (1 << (ptr[i] & 0x1F));
160
161 return mask;
162}
163
164Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
165{
166 Py_ssize_t i;
167
168 for (i = 0; i < setlen; i++)
169 if (set[i] == chr)
170 return 1;
171
172 return 0;
173}
174
/* True when chr is one of the setlen characters in set.  mask is tested
   first, so unicode_member() only runs when the bloom bit for chr is set.
   The expansion is fully parenthesized so the macro can be negated or
   combined with other operators without changing its meaning. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
177
Guido van Rossumd57fd912000-03-10 22:53:23 +0000178/* --- Unicode Object ----------------------------------------------------- */
179
180static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000182 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183{
184 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000185
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000186 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000187 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000188 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190 /* Resizing shared object (unicode_empty or single character
191 objects) in-place is not allowed. Use PyUnicode_Resize()
192 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000194 if (unicode == unicode_empty ||
195 (unicode->length == 1 &&
196 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 return -1;
201 }
202
Thomas Wouters477c8d52006-05-27 19:21:47 +0000203 /* We allocate one more byte to make sure the string is Ux0000 terminated.
204 The overallocation is also used by fastsearch, which assumes that it's
205 safe to look at str[length] (without making any assumptions about what
206 it contains). */
207
Guido van Rossumd57fd912000-03-10 22:53:23 +0000208 oldstr = unicode->str;
209 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
210 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000211 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212 PyErr_NoMemory();
213 return -1;
214 }
215 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000216 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000218 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000220 if (unicode->defenc) {
221 Py_DECREF(unicode->defenc);
222 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000223 }
224 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000225
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 return 0;
227}
228
229/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000230 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231
232 XXX This allocator could further be enhanced by assuring that the
233 free list never reduces its size below 1.
234
235*/
236
237static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000238PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000239{
240 register PyUnicodeObject *unicode;
241
Thomas Wouters477c8d52006-05-27 19:21:47 +0000242 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (length == 0 && unicode_empty != NULL) {
244 Py_INCREF(unicode_empty);
245 return unicode_empty;
246 }
247
248 /* Unicode freelist & memory allocation */
249 if (unicode_freelist) {
250 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000251 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000254 /* Keep-Alive optimization: we only upsize the buffer,
255 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000256 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000257 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000258 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000259 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000262 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000264 }
265 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 }
267 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000268 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 if (unicode == NULL)
270 return NULL;
271 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
272 }
273
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000274 if (!unicode->str) {
275 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000276 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000277 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000278 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000279 * the caller fails before initializing str -- unicode_resize()
280 * reads str[0], and the Keep-Alive optimization can keep memory
281 * allocated for str alive across a call to unicode_dealloc(unicode).
282 * We don't want unicode_resize to read uninitialized memory in
283 * that case.
284 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000285 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000287 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000289 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000290 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000292
293 onError:
294 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000295 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000296 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297}
298
299static
Guido van Rossum9475a232001-10-05 20:51:39 +0000300void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301{
Walter Dörwald16807132007-05-25 13:52:07 +0000302 switch (PyUnicode_CHECK_INTERNED(unicode)) {
303 case SSTATE_NOT_INTERNED:
304 break;
305
306 case SSTATE_INTERNED_MORTAL:
307 /* revive dead object temporarily for DelItem */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +0000308 Py_Refcnt(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000309 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
310 Py_FatalError(
311 "deletion of interned unicode string failed");
312 break;
313
314 case SSTATE_INTERNED_IMMORTAL:
315 Py_FatalError("Immortal interned unicode string died.");
316
317 default:
318 Py_FatalError("Inconsistent interned unicode string state.");
319 }
320
Guido van Rossum604ddf82001-12-06 20:03:56 +0000321 if (PyUnicode_CheckExact(unicode) &&
322 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000323 /* Keep-Alive optimization */
324 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000325 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 unicode->str = NULL;
327 unicode->length = 0;
328 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000329 if (unicode->defenc) {
330 Py_DECREF(unicode->defenc);
331 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000332 }
333 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 *(PyUnicodeObject **)unicode = unicode_freelist;
335 unicode_freelist = unicode;
336 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 }
338 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000339 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000340 Py_XDECREF(unicode->defenc);
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000341 Py_Type(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 }
343}
344
Martin v. Löwis18e16552006-02-15 17:27:45 +0000345int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000346{
347 register PyUnicodeObject *v;
348
349 /* Argument checks */
350 if (unicode == NULL) {
351 PyErr_BadInternalCall();
352 return -1;
353 }
354 v = (PyUnicodeObject *)*unicode;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000355 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000356 PyErr_BadInternalCall();
357 return -1;
358 }
359
360 /* Resizing unicode_empty and single character objects is not
361 possible since these are being shared. We simply return a fresh
362 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000363 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000364 (v == unicode_empty || v->length == 1)) {
365 PyUnicodeObject *w = _PyUnicode_New(length);
366 if (w == NULL)
367 return -1;
368 Py_UNICODE_COPY(w->str, v->str,
369 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000370 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000371 *unicode = (PyObject *)w;
372 return 0;
373 }
374
375 /* Note that we don't have to modify *unicode for unshared Unicode
376 objects, since we can modify them in-place. */
377 return unicode_resize(v, length);
378}
379
380/* Internal API for use in unicodeobject.c only ! */
381#define _PyUnicode_Resize(unicodevar, length) \
382 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
383
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000385 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386{
387 PyUnicodeObject *unicode;
388
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000389 /* If the Unicode data is known at construction time, we can apply
390 some optimizations which share commonly used objects. */
391 if (u != NULL) {
392
393 /* Optimization for empty strings */
394 if (size == 0 && unicode_empty != NULL) {
395 Py_INCREF(unicode_empty);
396 return (PyObject *)unicode_empty;
397 }
398
399 /* Single character Unicode objects in the Latin-1 range are
400 shared when using this constructor */
401 if (size == 1 && *u < 256) {
402 unicode = unicode_latin1[*u];
403 if (!unicode) {
404 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000405 if (!unicode)
406 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000407 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000408 unicode_latin1[*u] = unicode;
409 }
410 Py_INCREF(unicode);
411 return (PyObject *)unicode;
412 }
413 }
Tim Petersced69f82003-09-16 20:30:58 +0000414
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 unicode = _PyUnicode_New(size);
416 if (!unicode)
417 return NULL;
418
419 /* Copy the Unicode data into the new object */
420 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000421 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000422
423 return (PyObject *)unicode;
424}
425
Walter Dörwaldd2034312007-05-18 16:29:38 +0000426PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000427{
428 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000429 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000430 some optimizations which share commonly used objects.
431 Also, this means the input must be UTF-8, so fall back to the
432 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000433 if (u != NULL) {
434
435 /* Optimization for empty strings */
436 if (size == 0 && unicode_empty != NULL) {
437 Py_INCREF(unicode_empty);
438 return (PyObject *)unicode_empty;
439 }
440
Martin v. Löwis9c121062007-08-05 20:26:11 +0000441 /* Single characters are shared when using this constructor.
442 Restrict to ASCII, since the input must be UTF-8. */
443 if (size == 1 && Py_CHARMASK(*u) < 128) {
Guido van Rossum00058aa2007-07-19 18:21:28 +0000444 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000445 if (!unicode) {
446 unicode = _PyUnicode_New(1);
447 if (!unicode)
448 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000449 unicode->str[0] = Py_CHARMASK(*u);
450 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000451 }
452 Py_INCREF(unicode);
453 return (PyObject *)unicode;
454 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000455
456 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000457 }
458
Walter Dörwald55507312007-05-18 13:12:10 +0000459 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000460 if (!unicode)
461 return NULL;
462
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000463 return (PyObject *)unicode;
464}
465
Walter Dörwaldd2034312007-05-18 16:29:38 +0000466PyObject *PyUnicode_FromString(const char *u)
467{
468 size_t size = strlen(u);
469 if (size > PY_SSIZE_T_MAX) {
470 PyErr_SetString(PyExc_OverflowError, "input too long");
471 return NULL;
472 }
473
474 return PyUnicode_FromStringAndSize(u, size);
475}
476
Guido van Rossumd57fd912000-03-10 22:53:23 +0000477#ifdef HAVE_WCHAR_H
478
479PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000480 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000481{
482 PyUnicodeObject *unicode;
483
484 if (w == NULL) {
485 PyErr_BadInternalCall();
486 return NULL;
487 }
488
489 unicode = _PyUnicode_New(size);
490 if (!unicode)
491 return NULL;
492
493 /* Copy the wchar_t data into the new object */
494#ifdef HAVE_USABLE_WCHAR_T
495 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000496#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000497 {
498 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000499 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000500 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000501 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000502 *u++ = *w++;
503 }
504#endif
505
506 return (PyObject *)unicode;
507}
508
Walter Dörwald346737f2007-05-31 10:44:43 +0000509static void
510makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
511{
512 *fmt++ = '%';
513 if (width) {
514 if (zeropad)
515 *fmt++ = '0';
516 fmt += sprintf(fmt, "%d", width);
517 }
518 if (precision)
519 fmt += sprintf(fmt, ".%d", precision);
520 if (longflag)
521 *fmt++ = 'l';
522 else if (size_tflag) {
523 char *f = PY_FORMAT_SIZE_T;
524 while (*f)
525 *fmt++ = *f++;
526 }
527 *fmt++ = c;
528 *fmt = '\0';
529}
530
Walter Dörwaldd2034312007-05-18 16:29:38 +0000531#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
532
533PyObject *
534PyUnicode_FromFormatV(const char *format, va_list vargs)
535{
536 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000537 Py_ssize_t callcount = 0;
538 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000539 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000540 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000541 int width = 0;
542 int precision = 0;
543 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000544 const char* f;
545 Py_UNICODE *s;
546 PyObject *string;
547 /* used by sprintf */
548 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000549 /* use abuffer instead of buffer, if we need more space
550 * (which can happen if there's a format specifier with width). */
551 char *abuffer = NULL;
552 char *realbuffer;
553 Py_ssize_t abuffersize = 0;
554 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000555 const char *copy;
556
557#ifdef VA_LIST_IS_ARRAY
558 Py_MEMCPY(count, vargs, sizeof(va_list));
559#else
560#ifdef __va_copy
561 __va_copy(count, vargs);
562#else
563 count = vargs;
564#endif
565#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000566 /* step 1: count the number of %S/%R format specifications
567 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
568 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000569 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000570 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000571 ++callcount;
572 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000573 /* step 2: allocate memory for the results of
574 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000575 if (callcount) {
576 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
577 if (!callresults) {
578 PyErr_NoMemory();
579 return NULL;
580 }
581 callresult = callresults;
582 }
583 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000584 for (f = format; *f; f++) {
585 if (*f == '%') {
586 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000587 width = 0;
588 while (isdigit(Py_CHARMASK(*f)))
589 width = (width*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000590 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
591 ;
592
593 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
594 * they don't affect the amount of space we reserve.
595 */
596 if ((*f == 'l' || *f == 'z') &&
597 (f[1] == 'd' || f[1] == 'u'))
598 ++f;
599
600 switch (*f) {
601 case 'c':
602 (void)va_arg(count, int);
603 /* fall through... */
604 case '%':
605 n++;
606 break;
607 case 'd': case 'u': case 'i': case 'x':
608 (void) va_arg(count, int);
609 /* 20 bytes is enough to hold a 64-bit
610 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000611 This isn't enough for octal.
612 If a width is specified we need more
613 (which we allocate later). */
614 if (width < 20)
615 width = 20;
616 n += width;
617 if (abuffersize < width)
618 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000619 break;
620 case 's':
621 n += strlen(va_arg(count, char*));
622 break;
623 case 'U':
624 {
625 PyObject *obj = va_arg(count, PyObject *);
626 assert(obj && PyUnicode_Check(obj));
627 n += PyUnicode_GET_SIZE(obj);
628 break;
629 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000630 case 'V':
631 {
632 PyObject *obj = va_arg(count, PyObject *);
633 const char *str = va_arg(count, const char *);
634 assert(obj || str);
635 assert(!obj || PyUnicode_Check(obj));
636 if (obj)
637 n += PyUnicode_GET_SIZE(obj);
638 else
639 n += strlen(str);
640 break;
641 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000642 case 'S':
643 {
644 PyObject *obj = va_arg(count, PyObject *);
645 PyObject *str;
646 assert(obj);
647 str = PyObject_Unicode(obj);
648 if (!str)
649 goto fail;
650 n += PyUnicode_GET_SIZE(str);
651 /* Remember the str and switch to the next slot */
652 *callresult++ = str;
653 break;
654 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000655 case 'R':
656 {
657 PyObject *obj = va_arg(count, PyObject *);
658 PyObject *repr;
659 assert(obj);
660 repr = PyObject_Repr(obj);
661 if (!repr)
662 goto fail;
663 n += PyUnicode_GET_SIZE(repr);
664 /* Remember the repr and switch to the next slot */
665 *callresult++ = repr;
666 break;
667 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000668 case 'p':
669 (void) va_arg(count, int);
670 /* maximum 64-bit pointer representation:
671 * 0xffffffffffffffff
672 * so 19 characters is enough.
673 * XXX I count 18 -- what's the extra for?
674 */
675 n += 19;
676 break;
677 default:
678 /* if we stumble upon an unknown
679 formatting code, copy the rest of
680 the format string to the output
681 string. (we cannot just skip the
682 code, since there's no way to know
683 what's in the argument list) */
684 n += strlen(p);
685 goto expand;
686 }
687 } else
688 n++;
689 }
690 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000691 if (abuffersize > 20) {
692 abuffer = PyMem_Malloc(abuffersize);
693 if (!abuffer) {
694 PyErr_NoMemory();
695 goto fail;
696 }
697 realbuffer = abuffer;
698 }
699 else
700 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000701 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000702 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000703 we don't have to resize the string.
704 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000705 string = PyUnicode_FromUnicode(NULL, n);
706 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000707 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000708
709 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000710 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000711
712 for (f = format; *f; f++) {
713 if (*f == '%') {
714 const char* p = f++;
715 int longflag = 0;
716 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000717 zeropad = (*f == '0');
718 /* parse the width.precision part */
719 width = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000720 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000721 width = (width*10) + *f++ - '0';
722 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000723 if (*f == '.') {
724 f++;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000725 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000726 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000727 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000728 /* handle the long flag, but only for %ld and %lu.
729 others can be added when necessary. */
730 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
731 longflag = 1;
732 ++f;
733 }
734 /* handle the size_t flag. */
735 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
736 size_tflag = 1;
737 ++f;
738 }
739
740 switch (*f) {
741 case 'c':
742 *s++ = va_arg(vargs, int);
743 break;
744 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000745 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000746 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000747 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000748 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000749 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000750 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000751 sprintf(realbuffer, fmt, va_arg(vargs, int));
752 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000753 break;
754 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000755 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000756 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000757 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000758 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000759 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000760 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000761 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
762 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000763 break;
764 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000765 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
766 sprintf(realbuffer, fmt, va_arg(vargs, int));
767 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000768 break;
769 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000770 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
771 sprintf(realbuffer, fmt, va_arg(vargs, int));
772 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000773 break;
774 case 's':
775 p = va_arg(vargs, char*);
776 appendstring(p);
777 break;
778 case 'U':
779 {
780 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000781 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
782 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
783 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000784 break;
785 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000786 case 'V':
787 {
788 PyObject *obj = va_arg(vargs, PyObject *);
789 const char *str = va_arg(vargs, const char *);
790 if (obj) {
791 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
792 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
793 s += size;
794 } else {
795 appendstring(str);
796 }
797 break;
798 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000799 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000800 case 'R':
801 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000802 Py_UNICODE *ucopy;
803 Py_ssize_t usize;
804 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000805 /* unused, since we already have the result */
806 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000807 ucopy = PyUnicode_AS_UNICODE(*callresult);
808 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000809 for (upos = 0; upos<usize;)
810 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000811 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000812 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000813 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000814 ++callresult;
815 break;
816 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000817 case 'p':
818 sprintf(buffer, "%p", va_arg(vargs, void*));
819 /* %p is ill-defined: ensure leading 0x. */
820 if (buffer[1] == 'X')
821 buffer[1] = 'x';
822 else if (buffer[1] != 'x') {
823 memmove(buffer+2, buffer, strlen(buffer)+1);
824 buffer[0] = '0';
825 buffer[1] = 'x';
826 }
827 appendstring(buffer);
828 break;
829 case '%':
830 *s++ = '%';
831 break;
832 default:
833 appendstring(p);
834 goto end;
835 }
836 } else
837 *s++ = *f;
838 }
839
840 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000841 if (callresults)
842 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000843 if (abuffer)
844 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000845 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
846 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000847 fail:
848 if (callresults) {
849 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000850 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000851 Py_DECREF(*callresult2);
852 ++callresult2;
853 }
854 PyMem_Free(callresults);
855 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000856 if (abuffer)
857 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000858 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000859}
860
861#undef appendstring
862
863PyObject *
864PyUnicode_FromFormat(const char *format, ...)
865{
866 PyObject* ret;
867 va_list vargs;
868
869#ifdef HAVE_STDARG_PROTOTYPES
870 va_start(vargs, format);
871#else
872 va_start(vargs);
873#endif
874 ret = PyUnicode_FromFormatV(format, vargs);
875 va_end(vargs);
876 return ret;
877}
878
Martin v. Löwis18e16552006-02-15 17:27:45 +0000879Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
880 wchar_t *w,
881 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000882{
883 if (unicode == NULL) {
884 PyErr_BadInternalCall();
885 return -1;
886 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000887
888 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000889 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000890 size = PyUnicode_GET_SIZE(unicode) + 1;
891
Guido van Rossumd57fd912000-03-10 22:53:23 +0000892#ifdef HAVE_USABLE_WCHAR_T
893 memcpy(w, unicode->str, size * sizeof(wchar_t));
894#else
895 {
896 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000897 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000898 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000899 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000900 *w++ = *u++;
901 }
902#endif
903
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000904 if (size > PyUnicode_GET_SIZE(unicode))
905 return PyUnicode_GET_SIZE(unicode);
906 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000907 return size;
908}
909
910#endif
911
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000912PyObject *PyUnicode_FromOrdinal(int ordinal)
913{
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000914 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000915
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000916 if (ordinal < 0 || ordinal > 0x10ffff) {
917 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000918 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000919 return NULL;
920 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000921
922#ifndef Py_UNICODE_WIDE
923 if (ordinal > 0xffff) {
924 ordinal -= 0x10000;
925 s[0] = 0xD800 | (ordinal >> 10);
926 s[1] = 0xDC00 | (ordinal & 0x3FF);
927 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000928 }
929#endif
930
Hye-Shik Chang40574832004-04-06 07:24:51 +0000931 s[0] = (Py_UNICODE)ordinal;
932 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000933}
934
Guido van Rossumd57fd912000-03-10 22:53:23 +0000935PyObject *PyUnicode_FromObject(register PyObject *obj)
936{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000937 /* XXX Perhaps we should make this API an alias of
938 PyObject_Unicode() instead ?! */
939 if (PyUnicode_CheckExact(obj)) {
940 Py_INCREF(obj);
941 return obj;
942 }
943 if (PyUnicode_Check(obj)) {
944 /* For a Unicode subtype that's not a Unicode object,
945 return a true Unicode object with the same data. */
946 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
947 PyUnicode_GET_SIZE(obj));
948 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000949 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
950}
951
952PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
953 const char *encoding,
954 const char *errors)
955{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000956 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000957 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000958 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000959
Guido van Rossumd57fd912000-03-10 22:53:23 +0000960 if (obj == NULL) {
961 PyErr_BadInternalCall();
962 return NULL;
963 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000964
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000965#if 0
966 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000967 that no encodings is given and then redirect to
968 PyObject_Unicode() which then applies the additional logic for
969 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000970
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000971 NOTE: This API should really only be used for object which
972 represent *encoded* Unicode !
973
974 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000975 if (PyUnicode_Check(obj)) {
976 if (encoding) {
977 PyErr_SetString(PyExc_TypeError,
978 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000979 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000980 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000981 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000982 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000983#else
984 if (PyUnicode_Check(obj)) {
985 PyErr_SetString(PyExc_TypeError,
986 "decoding Unicode is not supported");
987 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000988 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000989#endif
990
991 /* Coerce object */
992 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000993 s = PyString_AS_STRING(obj);
994 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000995 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000996 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
997 /* Overwrite the error message with something more useful in
998 case of a TypeError. */
999 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001000 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001001 "coercing to Unicode: need string or buffer, "
1002 "%.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001003 Py_Type(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001004 goto onError;
1005 }
Tim Petersced69f82003-09-16 20:30:58 +00001006
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001007 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001008 if (len == 0) {
1009 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001010 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001011 }
Tim Petersced69f82003-09-16 20:30:58 +00001012 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001013 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001014
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001015 return v;
1016
1017 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001018 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001019}
1020
1021PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001022 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001023 const char *encoding,
1024 const char *errors)
1025{
1026 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001027
1028 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001029 encoding = PyUnicode_GetDefaultEncoding();
1030
1031 /* Shortcuts for common default encodings */
1032 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001033 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001034 else if (strcmp(encoding, "latin-1") == 0)
1035 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001036#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1037 else if (strcmp(encoding, "mbcs") == 0)
1038 return PyUnicode_DecodeMBCS(s, size, errors);
1039#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001040 else if (strcmp(encoding, "ascii") == 0)
1041 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042
1043 /* Decode via the codec registry */
1044 buffer = PyBuffer_FromMemory((void *)s, size);
1045 if (buffer == NULL)
1046 goto onError;
1047 unicode = PyCodec_Decode(buffer, encoding, errors);
1048 if (unicode == NULL)
1049 goto onError;
1050 if (!PyUnicode_Check(unicode)) {
1051 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001052 "decoder did not return an unicode object (type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001053 Py_Type(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001054 Py_DECREF(unicode);
1055 goto onError;
1056 }
1057 Py_DECREF(buffer);
1058 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001059
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060 onError:
1061 Py_XDECREF(buffer);
1062 return NULL;
1063}
1064
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001065PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1066 const char *encoding,
1067 const char *errors)
1068{
1069 PyObject *v;
1070
1071 if (!PyUnicode_Check(unicode)) {
1072 PyErr_BadArgument();
1073 goto onError;
1074 }
1075
1076 if (encoding == NULL)
1077 encoding = PyUnicode_GetDefaultEncoding();
1078
1079 /* Decode via the codec registry */
1080 v = PyCodec_Decode(unicode, encoding, errors);
1081 if (v == NULL)
1082 goto onError;
1083 return v;
1084
1085 onError:
1086 return NULL;
1087}
1088
Guido van Rossumd57fd912000-03-10 22:53:23 +00001089PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001090 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 const char *encoding,
1092 const char *errors)
1093{
1094 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001095
Guido van Rossumd57fd912000-03-10 22:53:23 +00001096 unicode = PyUnicode_FromUnicode(s, size);
1097 if (unicode == NULL)
1098 return NULL;
1099 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1100 Py_DECREF(unicode);
1101 return v;
1102}
1103
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001104PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1105 const char *encoding,
1106 const char *errors)
1107{
1108 PyObject *v;
1109
1110 if (!PyUnicode_Check(unicode)) {
1111 PyErr_BadArgument();
1112 goto onError;
1113 }
1114
1115 if (encoding == NULL)
1116 encoding = PyUnicode_GetDefaultEncoding();
1117
1118 /* Encode via the codec registry */
1119 v = PyCodec_Encode(unicode, encoding, errors);
1120 if (v == NULL)
1121 goto onError;
1122 return v;
1123
1124 onError:
1125 return NULL;
1126}
1127
Guido van Rossumd57fd912000-03-10 22:53:23 +00001128PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1129 const char *encoding,
1130 const char *errors)
1131{
1132 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001133
Guido van Rossumd57fd912000-03-10 22:53:23 +00001134 if (!PyUnicode_Check(unicode)) {
1135 PyErr_BadArgument();
1136 goto onError;
1137 }
Fred Drakee4315f52000-05-09 19:53:39 +00001138
Tim Petersced69f82003-09-16 20:30:58 +00001139 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001140 encoding = PyUnicode_GetDefaultEncoding();
1141
1142 /* Shortcuts for common default encodings */
1143 if (errors == NULL) {
1144 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001145 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001146 else if (strcmp(encoding, "latin-1") == 0)
1147 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001148#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1149 else if (strcmp(encoding, "mbcs") == 0)
1150 return PyUnicode_AsMBCSString(unicode);
1151#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001152 else if (strcmp(encoding, "ascii") == 0)
1153 return PyUnicode_AsASCIIString(unicode);
1154 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001155
1156 /* Encode via the codec registry */
1157 v = PyCodec_Encode(unicode, encoding, errors);
1158 if (v == NULL)
1159 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001160 if (!PyBytes_Check(v)) {
1161 if (PyString_Check(v)) {
1162 /* Old codec, turn it into bytes */
1163 PyObject *b = PyBytes_FromObject(v);
1164 Py_DECREF(v);
1165 return b;
1166 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001168 "encoder did not return a bytes object "
1169 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1170 v->ob_type->tp_name,
1171 encoding ? encoding : "NULL",
1172 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001173 Py_DECREF(v);
1174 goto onError;
1175 }
1176 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001177
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178 onError:
1179 return NULL;
1180}
1181
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001182PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1183 const char *errors)
1184{
1185 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001186 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001187 if (v)
1188 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001189 if (errors != NULL)
1190 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1191 if (errors == NULL) {
1192 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1193 PyUnicode_GET_SIZE(unicode),
1194 NULL);
1195 }
1196 else {
1197 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
1198 }
1199 if (!b)
1200 return NULL;
1201 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1202 PyBytes_Size(b));
1203 Py_DECREF(b);
Guido van Rossume7a0d392007-07-12 07:53:00 +00001204 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001205 return v;
1206}
1207
Martin v. Löwis5b222132007-06-10 09:51:05 +00001208char*
1209PyUnicode_AsString(PyObject *unicode)
1210{
1211 assert(PyUnicode_Check(unicode));
1212 unicode = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1213 if (!unicode)
1214 return NULL;
1215 return PyString_AsString(unicode);
1216}
1217
Guido van Rossumd57fd912000-03-10 22:53:23 +00001218Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1219{
1220 if (!PyUnicode_Check(unicode)) {
1221 PyErr_BadArgument();
1222 goto onError;
1223 }
1224 return PyUnicode_AS_UNICODE(unicode);
1225
1226 onError:
1227 return NULL;
1228}
1229
Martin v. Löwis18e16552006-02-15 17:27:45 +00001230Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001231{
1232 if (!PyUnicode_Check(unicode)) {
1233 PyErr_BadArgument();
1234 goto onError;
1235 }
1236 return PyUnicode_GET_SIZE(unicode);
1237
1238 onError:
1239 return -1;
1240}
1241
Thomas Wouters78890102000-07-22 19:25:51 +00001242const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001243{
1244 return unicode_default_encoding;
1245}
1246
1247int PyUnicode_SetDefaultEncoding(const char *encoding)
1248{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001249 if (strcmp(encoding, unicode_default_encoding) != 0) {
1250 PyErr_Format(PyExc_ValueError,
1251 "Can only set default encoding to %s",
1252 unicode_default_encoding);
1253 return -1;
1254 }
Fred Drakee4315f52000-05-09 19:53:39 +00001255 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001256}
1257
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001258/* error handling callback helper:
1259 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001260 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001261 and adjust various state variables.
1262 return 0 on success, -1 on error
1263*/
1264
1265static
1266int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1267 const char *encoding, const char *reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001268 const char **input, const char **inend, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001269 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001270{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001271 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001272
1273 PyObject *restuple = NULL;
1274 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001275 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001276 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001277 Py_ssize_t requiredsize;
1278 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001279 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001280 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001281 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001282 int res = -1;
1283
1284 if (*errorHandler == NULL) {
1285 *errorHandler = PyCodec_LookupError(errors);
1286 if (*errorHandler == NULL)
1287 goto onError;
1288 }
1289
1290 if (*exceptionObject == NULL) {
1291 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001292 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001293 if (*exceptionObject == NULL)
1294 goto onError;
1295 }
1296 else {
1297 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1298 goto onError;
1299 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1300 goto onError;
1301 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1302 goto onError;
1303 }
1304
1305 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1306 if (restuple == NULL)
1307 goto onError;
1308 if (!PyTuple_Check(restuple)) {
1309 PyErr_Format(PyExc_TypeError, &argparse[4]);
1310 goto onError;
1311 }
1312 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1313 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001314
1315 /* Copy back the bytes variables, which might have been modified by the
1316 callback */
1317 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1318 if (!inputobj)
1319 goto onError;
1320 if (!PyBytes_Check(inputobj)) {
1321 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1322 }
1323 *input = PyBytes_AS_STRING(inputobj);
1324 insize = PyBytes_GET_SIZE(inputobj);
1325 *inend = *input + insize;
1326
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001327 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001328 newpos = insize+newpos;
1329 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001330 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001331 goto onError;
1332 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001333
1334 /* need more space? (at least enough for what we
1335 have+the replacement+the rest of the string (starting
1336 at the new input position), so we won't have to check space
1337 when there are no errors in the rest of the string) */
1338 repptr = PyUnicode_AS_UNICODE(repunicode);
1339 repsize = PyUnicode_GET_SIZE(repunicode);
1340 requiredsize = *outpos + repsize + insize-newpos;
1341 if (requiredsize > outsize) {
1342 if (requiredsize<2*outsize)
1343 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001344 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001345 goto onError;
1346 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1347 }
1348 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001349 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001350 Py_UNICODE_COPY(*outptr, repptr, repsize);
1351 *outptr += repsize;
1352 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001353
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001354 /* we made it! */
1355 res = 0;
1356
1357 onError:
1358 Py_XDECREF(restuple);
1359 return res;
1360}
1361
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001362/* --- UTF-7 Codec -------------------------------------------------------- */
1363
1364/* see RFC2152 for details */
1365
/* Per-character classification for the UTF-7 codec, indexed by the
   7-bit character code.  Each entry says whether the character is
   special, i.e. cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static char utf7_special[128] = {
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1384
/* SPECIAL(c, encodeO, encodeWS) is true when character c has to be
   treated as special by the UTF-7 codec: it lies outside 1..127, or its
   utf7_special[] class is 1, or its class is 2 (whitespace) while
   encodeWS is set, or its class is 3 (Set O) while encodeO is set.

   Note: the range test is spelled (c) <= 0 rather than (c) < 0 to
   work around gcc warnings about a comparison that is always false;
   since utf7_special[0] is 1, treating 0 as special here changes
   nothing. */

#define SPECIAL(c, encodeO, encodeWS)                   \
    ((c) > 127 ||                                       \
     (c) <= 0 ||                                        \
     utf7_special[(c)] == 1 ||                          \
     ((encodeWS) && utf7_special[(c)] == 2) ||          \
     ((encodeO) && utf7_special[(c)] == 3))
1394
/* B64(n): base64 alphabet character for the low six bits of n. */
#define B64(n)                                          \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"                       \
     "abcdefghijklmnopqrstuvwxyz"                       \
     "0123456789+/"[(n) & 0x3f])
/* B64CHAR(c): true if c belongs to the base64 alphabet (ASCII letters,
   digits, '+' and '/').

   isalnum() is only defined for arguments representable as unsigned
   char (or EOF), while callers pass wider character values.  The mask
   test rejects everything outside 0..127 before isalnum() is reached;
   it is written as a mask rather than a range comparison so that it
   works for signed and unsigned character types alike. */
#define B64CHAR(c) \
    ((((c) & ~0x7f) == 0 && isalnum(c)) || (c) == '+' || (c) == '/')
/* UB64(c): six-bit value of base64 alphabet character c
   ('A'-'Z' -> 0-25, 'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62,
   '/' -> 63).  The argument is assumed to satisfy B64CHAR(); the
   character arithmetic relies on ASCII code values. */
#define UB64(c)                                         \
    ((c) == '+' ? 62 :                                  \
     (c) == '/' ? 63 :                                  \
     (c) >= 'a' ? (c) - ('a' - 26) :                    \
     (c) >= 'A' ? (c) - 'A' :                           \
                  (c) + (52 - '0'))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001402
/* ENCODE(out, ch, bits): while at least six bits are pending in ch,
   emit the topmost six pending bits as one base64 character through
   out and reduce bits accordingly.  out and bits are modified. */
#define ENCODE(out, ch, bits)                           \
    for (; bits >= 6; bits -= 6) {                      \
        *out++ = B64(ch >> (bits-6));                   \
    }
1408
/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending
   in ch, extract the topmost 16 pending bits as one Py_UNICODE and
   reduce bits by 16.  For each extracted unit:
     - if surrogate is set, the unit is dropped and surrogate is
       cleared (an error was already reported for its partner, so the
       unit is not checked);
     - if the unit lies in 0xDC00..0xDFFF, surrogate is set, errmsg is
       set and control jumps to the utf7Error label (such pairs can't
       be represented in a 16-bit character);
     - otherwise the unit is stored through out.
   The expansion site must provide `errmsg` and the `utf7Error` label.
   NOTE(review): the tested range is the low-surrogate half although
   the original wording speaks of the high surrogate -- confirm which
   is intended. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            surrogate = 0;                                              \
            continue;                                                   \
        }                                                               \
        if (outCh < 0xDC00 || outCh > 0xDFFF) {                         \
            *out++ = outCh;                                             \
            continue;                                                   \
        }                                                               \
        surrogate = 1;                                                  \
        errmsg = "code pairs are not supported";                        \
        goto utf7Error;                                                 \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001427
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001428PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001429 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001430 const char *errors)
1431{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001432 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001433 Py_ssize_t startinpos;
1434 Py_ssize_t endinpos;
1435 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001436 const char *e;
1437 PyUnicodeObject *unicode;
1438 Py_UNICODE *p;
1439 const char *errmsg = "";
1440 int inShift = 0;
1441 unsigned int bitsleft = 0;
1442 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001443 int surrogate = 0;
1444 PyObject *errorHandler = NULL;
1445 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001446
1447 unicode = _PyUnicode_New(size);
1448 if (!unicode)
1449 return NULL;
1450 if (size == 0)
1451 return (PyObject *)unicode;
1452
1453 p = unicode->str;
1454 e = s + size;
1455
1456 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001457 Py_UNICODE ch;
1458 restart:
1459 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001460
1461 if (inShift) {
1462 if ((ch == '-') || !B64CHAR(ch)) {
1463 inShift = 0;
1464 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001465
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001466 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1467 if (bitsleft >= 6) {
1468 /* The shift sequence has a partial character in it. If
1469 bitsleft < 6 then we could just classify it as padding
1470 but that is not the case here */
1471
1472 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001473 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001474 }
1475 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001476 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001477 here so indicate the potential of a misencoded character. */
1478
1479 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1480 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1481 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001482 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001483 }
1484
1485 if (ch == '-') {
1486 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001487 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001488 inShift = 1;
1489 }
1490 } else if (SPECIAL(ch,0,0)) {
1491 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001492 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001493 } else {
1494 *p++ = ch;
1495 }
1496 } else {
1497 charsleft = (charsleft << 6) | UB64(ch);
1498 bitsleft += 6;
1499 s++;
1500 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1501 }
1502 }
1503 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001504 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001505 s++;
1506 if (s < e && *s == '-') {
1507 s++;
1508 *p++ = '+';
1509 } else
1510 {
1511 inShift = 1;
1512 bitsleft = 0;
1513 }
1514 }
1515 else if (SPECIAL(ch,0,0)) {
1516 errmsg = "unexpected special character";
1517 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001518 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001519 }
1520 else {
1521 *p++ = ch;
1522 s++;
1523 }
1524 continue;
1525 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001526 outpos = p-PyUnicode_AS_UNICODE(unicode);
1527 endinpos = s-starts;
1528 if (unicode_decode_call_errorhandler(
1529 errors, &errorHandler,
1530 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001531 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001532 (PyObject **)&unicode, &outpos, &p))
1533 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001534 }
1535
1536 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001537 outpos = p-PyUnicode_AS_UNICODE(unicode);
1538 endinpos = size;
1539 if (unicode_decode_call_errorhandler(
1540 errors, &errorHandler,
1541 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001542 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001543 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001544 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001545 if (s < e)
1546 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001547 }
1548
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001549 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001550 goto onError;
1551
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001552 Py_XDECREF(errorHandler);
1553 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001554 return (PyObject *)unicode;
1555
1556onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001557 Py_XDECREF(errorHandler);
1558 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001559 Py_DECREF(unicode);
1560 return NULL;
1561}
1562
1563
1564PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001565 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001566 int encodeSetO,
1567 int encodeWhiteSpace,
1568 const char *errors)
1569{
1570 PyObject *v;
1571 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001572 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001573 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001574 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001575 unsigned int bitsleft = 0;
1576 unsigned long charsleft = 0;
1577 char * out;
1578 char * start;
1579
1580 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001581 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001582
Walter Dörwald51ab4142007-05-05 14:43:36 +00001583 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001584 if (v == NULL)
1585 return NULL;
1586
Walter Dörwald51ab4142007-05-05 14:43:36 +00001587 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001588 for (;i < size; ++i) {
1589 Py_UNICODE ch = s[i];
1590
1591 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001592 if (ch == '+') {
1593 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001594 *out++ = '-';
1595 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1596 charsleft = ch;
1597 bitsleft = 16;
1598 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001599 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001600 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001601 } else {
1602 *out++ = (char) ch;
1603 }
1604 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001605 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1606 *out++ = B64(charsleft << (6-bitsleft));
1607 charsleft = 0;
1608 bitsleft = 0;
1609 /* Characters not in the BASE64 set implicitly unshift the sequence
1610 so no '-' is required, except if the character is itself a '-' */
1611 if (B64CHAR(ch) || ch == '-') {
1612 *out++ = '-';
1613 }
1614 inShift = 0;
1615 *out++ = (char) ch;
1616 } else {
1617 bitsleft += 16;
1618 charsleft = (charsleft << 16) | ch;
1619 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1620
1621 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001622 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001623 or '-' then the shift sequence will be terminated implicitly and we
1624 don't have to insert a '-'. */
1625
1626 if (bitsleft == 0) {
1627 if (i + 1 < size) {
1628 Py_UNICODE ch2 = s[i+1];
1629
1630 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001631
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001632 } else if (B64CHAR(ch2) || ch2 == '-') {
1633 *out++ = '-';
1634 inShift = 0;
1635 } else {
1636 inShift = 0;
1637 }
1638
1639 }
1640 else {
1641 *out++ = '-';
1642 inShift = 0;
1643 }
1644 }
Tim Petersced69f82003-09-16 20:30:58 +00001645 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001646 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001647 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001648 if (bitsleft) {
1649 *out++= B64(charsleft << (6-bitsleft) );
1650 *out++ = '-';
1651 }
1652
Walter Dörwald51ab4142007-05-05 14:43:36 +00001653 if (PyBytes_Resize(v, out - start)) {
1654 Py_DECREF(v);
1655 return NULL;
1656 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001657 return v;
1658}
1659
1660#undef SPECIAL
1661#undef B64
1662#undef B64CHAR
1663#undef UB64
1664#undef ENCODE
1665#undef DECODE
1666
Guido van Rossumd57fd912000-03-10 22:53:23 +00001667/* --- UTF-8 Codec -------------------------------------------------------- */
1668
/* Lookup table indexed by the first byte of a UTF-8 sequence, giving the
   total length in bytes of the sequence that byte introduces.  A zero
   entry marks a byte that cannot start a sequence.  See RFC 2279 for
   details. */
static char utf8_code_length[256] = {
    /* 0x00-0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    /* 0x80-0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    /* 0xF0-0xF7: four bytes; 0xF8-0xFB: five; 0xFC-0xFD: six;
       0xFE-0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1690
Guido van Rossumd57fd912000-03-10 22:53:23 +00001691PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001692 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001693 const char *errors)
1694{
Walter Dörwald69652032004-09-07 20:24:22 +00001695 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1696}
1697
1698PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001699 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001700 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001701 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001702{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001703 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001704 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001705 Py_ssize_t startinpos;
1706 Py_ssize_t endinpos;
1707 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001708 const char *e;
1709 PyUnicodeObject *unicode;
1710 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001711 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001712 PyObject *errorHandler = NULL;
1713 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001714
1715 /* Note: size will always be longer than the resulting Unicode
1716 character count */
1717 unicode = _PyUnicode_New(size);
1718 if (!unicode)
1719 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001720 if (size == 0) {
1721 if (consumed)
1722 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001723 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001724 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001725
1726 /* Unpack UTF-8 encoded data */
1727 p = unicode->str;
1728 e = s + size;
1729
1730 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001731 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001732
1733 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001734 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001735 s++;
1736 continue;
1737 }
1738
1739 n = utf8_code_length[ch];
1740
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001741 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001742 if (consumed)
1743 break;
1744 else {
1745 errmsg = "unexpected end of data";
1746 startinpos = s-starts;
1747 endinpos = size;
1748 goto utf8Error;
1749 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001750 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751
1752 switch (n) {
1753
1754 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001755 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001756 startinpos = s-starts;
1757 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001758 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001759
1760 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001761 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001762 startinpos = s-starts;
1763 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001764 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001765
1766 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001767 if ((s[1] & 0xc0) != 0x80) {
1768 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001769 startinpos = s-starts;
1770 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001771 goto utf8Error;
1772 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001773 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001774 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001775 startinpos = s-starts;
1776 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001777 errmsg = "illegal encoding";
1778 goto utf8Error;
1779 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001781 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 break;
1783
1784 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001785 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001786 (s[2] & 0xc0) != 0x80) {
1787 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001788 startinpos = s-starts;
1789 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001790 goto utf8Error;
1791 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001792 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001793 if (ch < 0x0800) {
1794 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001795 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001796
1797 XXX For wide builds (UCS-4) we should probably try
1798 to recombine the surrogates into a single code
1799 unit.
1800 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001801 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001802 startinpos = s-starts;
1803 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001804 goto utf8Error;
1805 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001806 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001807 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001808 break;
1809
1810 case 4:
1811 if ((s[1] & 0xc0) != 0x80 ||
1812 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001813 (s[3] & 0xc0) != 0x80) {
1814 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001815 startinpos = s-starts;
1816 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001817 goto utf8Error;
1818 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001819 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1820 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1821 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001822 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001823 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001824 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001825 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001826 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001827 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001828 startinpos = s-starts;
1829 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001830 goto utf8Error;
1831 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001832#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001833 *p++ = (Py_UNICODE)ch;
1834#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001835 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001836
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001837 /* translate from 10000..10FFFF to 0..FFFF */
1838 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001839
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001840 /* high surrogate = top 10 bits added to D800 */
1841 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001842
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001843 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001844 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001845#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 break;
1847
1848 default:
1849 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001850 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001851 startinpos = s-starts;
1852 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001853 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854 }
1855 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001856 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001857
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001858 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001859 outpos = p-PyUnicode_AS_UNICODE(unicode);
1860 if (unicode_decode_call_errorhandler(
1861 errors, &errorHandler,
1862 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001863 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001864 (PyObject **)&unicode, &outpos, &p))
1865 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001866 }
Walter Dörwald69652032004-09-07 20:24:22 +00001867 if (consumed)
1868 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001869
1870 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001871 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001872 goto onError;
1873
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001874 Py_XDECREF(errorHandler);
1875 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001876 return (PyObject *)unicode;
1877
1878onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001879 Py_XDECREF(errorHandler);
1880 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001881 Py_DECREF(unicode);
1882 return NULL;
1883}
1884
Tim Peters602f7402002-04-27 18:03:26 +00001885/* Allocation strategy: if the string is short, convert into a stack buffer
1886 and allocate exactly as much space needed at the end. Else allocate the
1887 maximum possible needed (4 result bytes per Unicode character), and return
1888 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001889*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001890PyObject *
1891PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001892 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001893 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001894{
Tim Peters602f7402002-04-27 18:03:26 +00001895#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001896
Martin v. Löwis18e16552006-02-15 17:27:45 +00001897 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001898 PyObject *v; /* result string object */
1899 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001900 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001901 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001902 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001903
Tim Peters602f7402002-04-27 18:03:26 +00001904 assert(s != NULL);
1905 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001906
Tim Peters602f7402002-04-27 18:03:26 +00001907 if (size <= MAX_SHORT_UNICHARS) {
1908 /* Write into the stack buffer; nallocated can't overflow.
1909 * At the end, we'll allocate exactly as much heap space as it
1910 * turns out we need.
1911 */
1912 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1913 v = NULL; /* will allocate after we're done */
1914 p = stackbuf;
1915 }
1916 else {
1917 /* Overallocate on the heap, and give the excess back at the end. */
1918 nallocated = size * 4;
1919 if (nallocated / 4 != size) /* overflow! */
1920 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001921 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001922 if (v == NULL)
1923 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001924 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001925 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001926
Tim Peters602f7402002-04-27 18:03:26 +00001927 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001928 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001929
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001930 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001931 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001932 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001933
Guido van Rossumd57fd912000-03-10 22:53:23 +00001934 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001935 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001936 *p++ = (char)(0xc0 | (ch >> 6));
1937 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001938 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001939 else {
Tim Peters602f7402002-04-27 18:03:26 +00001940 /* Encode UCS2 Unicode ordinals */
1941 if (ch < 0x10000) {
1942 /* Special case: check for high surrogate */
1943 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1944 Py_UCS4 ch2 = s[i];
1945 /* Check for low surrogate and combine the two to
1946 form a UCS4 value */
1947 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001948 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001949 i++;
1950 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001951 }
Tim Peters602f7402002-04-27 18:03:26 +00001952 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001953 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001954 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001955 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1956 *p++ = (char)(0x80 | (ch & 0x3f));
1957 continue;
1958 }
1959encodeUCS4:
1960 /* Encode UCS4 Unicode ordinals */
1961 *p++ = (char)(0xf0 | (ch >> 18));
1962 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1963 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1964 *p++ = (char)(0x80 | (ch & 0x3f));
1965 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001966 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001967
Tim Peters602f7402002-04-27 18:03:26 +00001968 if (v == NULL) {
1969 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001970 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001971 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001972 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001973 }
1974 else {
1975 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001976 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001977 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001978 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001979 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001980 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001981
Tim Peters602f7402002-04-27 18:03:26 +00001982#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001983}
1984
Guido van Rossumd57fd912000-03-10 22:53:23 +00001985PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1986{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987 if (!PyUnicode_Check(unicode)) {
1988 PyErr_BadArgument();
1989 return NULL;
1990 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001991 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1992 PyUnicode_GET_SIZE(unicode),
1993 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001994}
1995
1996/* --- UTF-16 Codec ------------------------------------------------------- */
1997
Tim Peters772747b2001-08-09 22:21:55 +00001998PyObject *
1999PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002000 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002001 const char *errors,
2002 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002003{
Walter Dörwald69652032004-09-07 20:24:22 +00002004 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2005}
2006
2007PyObject *
2008PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002009 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002010 const char *errors,
2011 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002012 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002013{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002014 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002015 Py_ssize_t startinpos;
2016 Py_ssize_t endinpos;
2017 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002018 PyUnicodeObject *unicode;
2019 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002020 const unsigned char *q, *e;
2021 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002022 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002023 /* Offsets from q for retrieving byte pairs in the right order. */
2024#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2025 int ihi = 1, ilo = 0;
2026#else
2027 int ihi = 0, ilo = 1;
2028#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002029 PyObject *errorHandler = NULL;
2030 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002031
2032 /* Note: size will always be longer than the resulting Unicode
2033 character count */
2034 unicode = _PyUnicode_New(size);
2035 if (!unicode)
2036 return NULL;
2037 if (size == 0)
2038 return (PyObject *)unicode;
2039
2040 /* Unpack UTF-16 encoded data */
2041 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002042 q = (unsigned char *)s;
2043 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044
2045 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002046 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002048 /* Check for BOM marks (U+FEFF) in the input and adjust current
2049 byte order setting accordingly. In native mode, the leading BOM
2050 mark is skipped, in all other modes, it is copied to the output
2051 stream as-is (giving a ZWNBSP character). */
2052 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002053 if (size >= 2) {
2054 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002055#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002056 if (bom == 0xFEFF) {
2057 q += 2;
2058 bo = -1;
2059 }
2060 else if (bom == 0xFFFE) {
2061 q += 2;
2062 bo = 1;
2063 }
Tim Petersced69f82003-09-16 20:30:58 +00002064#else
Walter Dörwald69652032004-09-07 20:24:22 +00002065 if (bom == 0xFEFF) {
2066 q += 2;
2067 bo = 1;
2068 }
2069 else if (bom == 0xFFFE) {
2070 q += 2;
2071 bo = -1;
2072 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002073#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002074 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002075 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002076
Tim Peters772747b2001-08-09 22:21:55 +00002077 if (bo == -1) {
2078 /* force LE */
2079 ihi = 1;
2080 ilo = 0;
2081 }
2082 else if (bo == 1) {
2083 /* force BE */
2084 ihi = 0;
2085 ilo = 1;
2086 }
2087
2088 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002089 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002090 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002091 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002092 if (consumed)
2093 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002094 errmsg = "truncated data";
2095 startinpos = ((const char *)q)-starts;
2096 endinpos = ((const char *)e)-starts;
2097 goto utf16Error;
2098 /* The remaining input chars are ignored if the callback
2099 chooses to skip the input */
2100 }
2101 ch = (q[ihi] << 8) | q[ilo];
2102
Tim Peters772747b2001-08-09 22:21:55 +00002103 q += 2;
2104
Guido van Rossumd57fd912000-03-10 22:53:23 +00002105 if (ch < 0xD800 || ch > 0xDFFF) {
2106 *p++ = ch;
2107 continue;
2108 }
2109
2110 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002111 if (q >= e) {
2112 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002113 startinpos = (((const char *)q)-2)-starts;
2114 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002115 goto utf16Error;
2116 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002117 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002118 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2119 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002120 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002121#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002122 *p++ = ch;
2123 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002124#else
2125 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002126#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002127 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002128 }
2129 else {
2130 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002131 startinpos = (((const char *)q)-4)-starts;
2132 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002133 goto utf16Error;
2134 }
2135
Guido van Rossumd57fd912000-03-10 22:53:23 +00002136 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002137 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002138 startinpos = (((const char *)q)-2)-starts;
2139 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002140 /* Fall through to report the error */
2141
2142 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002143 outpos = p-PyUnicode_AS_UNICODE(unicode);
2144 if (unicode_decode_call_errorhandler(
2145 errors, &errorHandler,
2146 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002147 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002148 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002149 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002150 }
2151
2152 if (byteorder)
2153 *byteorder = bo;
2154
Walter Dörwald69652032004-09-07 20:24:22 +00002155 if (consumed)
2156 *consumed = (const char *)q-starts;
2157
Guido van Rossumd57fd912000-03-10 22:53:23 +00002158 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002159 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002160 goto onError;
2161
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002162 Py_XDECREF(errorHandler);
2163 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002164 return (PyObject *)unicode;
2165
2166onError:
2167 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002168 Py_XDECREF(errorHandler);
2169 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002170 return NULL;
2171}
2172
Tim Peters772747b2001-08-09 22:21:55 +00002173PyObject *
2174PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002175 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002176 const char *errors,
2177 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002178{
2179 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002180 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002181#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002182 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002183#else
2184 const int pairs = 0;
2185#endif
Tim Peters772747b2001-08-09 22:21:55 +00002186 /* Offsets from p for storing byte pairs in the right order. */
2187#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2188 int ihi = 1, ilo = 0;
2189#else
2190 int ihi = 0, ilo = 1;
2191#endif
2192
2193#define STORECHAR(CH) \
2194 do { \
2195 p[ihi] = ((CH) >> 8) & 0xff; \
2196 p[ilo] = (CH) & 0xff; \
2197 p += 2; \
2198 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002199
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002200#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002201 for (i = pairs = 0; i < size; i++)
2202 if (s[i] >= 0x10000)
2203 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002204#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002205 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002206 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002207 if (v == NULL)
2208 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002209
Walter Dörwald3cc34522007-05-04 10:48:27 +00002210 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002211 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002212 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002213 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002214 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002215
2216 if (byteorder == -1) {
2217 /* force LE */
2218 ihi = 1;
2219 ilo = 0;
2220 }
2221 else if (byteorder == 1) {
2222 /* force BE */
2223 ihi = 0;
2224 ilo = 1;
2225 }
2226
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002227 while (size-- > 0) {
2228 Py_UNICODE ch = *s++;
2229 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002230#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002231 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002232 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2233 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002234 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002235#endif
Tim Peters772747b2001-08-09 22:21:55 +00002236 STORECHAR(ch);
2237 if (ch2)
2238 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002239 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002240 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002241#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002242}
2243
2244PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2245{
2246 if (!PyUnicode_Check(unicode)) {
2247 PyErr_BadArgument();
2248 return NULL;
2249 }
2250 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2251 PyUnicode_GET_SIZE(unicode),
2252 NULL,
2253 0);
2254}
2255
2256/* --- Unicode Escape Codec ----------------------------------------------- */
2257
/* Cached pointer to the character-name lookup API exported by the
   unicodedata module as "ucnhash_CAPI".  It stays NULL until
   PyUnicode_DecodeUnicodeEscape() meets its first \N escape and
   imports the module. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00002258static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002259
Guido van Rossumd57fd912000-03-10 22:53:23 +00002260PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002261 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002262 const char *errors)
2263{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002264 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002265 Py_ssize_t startinpos;
2266 Py_ssize_t endinpos;
2267 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002268 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002269 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002270 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002271 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002272 char* message;
2273 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002274 PyObject *errorHandler = NULL;
2275 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002276
Guido van Rossumd57fd912000-03-10 22:53:23 +00002277 /* Escaped strings will always be longer than the resulting
2278 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002279 length after conversion to the true value.
2280 (but if the error callback returns a long replacement string
2281 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002282 v = _PyUnicode_New(size);
2283 if (v == NULL)
2284 goto onError;
2285 if (size == 0)
2286 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002287
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002288 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002289 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002290
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291 while (s < end) {
2292 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002293 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002294 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002295
2296 /* Non-escape characters are interpreted as Unicode ordinals */
2297 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002298 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299 continue;
2300 }
2301
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002302 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002303 /* \ - Escapes */
2304 s++;
2305 switch (*s++) {
2306
2307 /* \x escapes */
2308 case '\n': break;
2309 case '\\': *p++ = '\\'; break;
2310 case '\'': *p++ = '\''; break;
2311 case '\"': *p++ = '\"'; break;
2312 case 'b': *p++ = '\b'; break;
2313 case 'f': *p++ = '\014'; break; /* FF */
2314 case 't': *p++ = '\t'; break;
2315 case 'n': *p++ = '\n'; break;
2316 case 'r': *p++ = '\r'; break;
2317 case 'v': *p++ = '\013'; break; /* VT */
2318 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2319
2320 /* \OOO (octal) escapes */
2321 case '0': case '1': case '2': case '3':
2322 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002323 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002324 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002325 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002326 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002327 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002328 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002329 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002330 break;
2331
Fredrik Lundhccc74732001-02-18 22:13:49 +00002332 /* hex escapes */
2333 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002334 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002335 digits = 2;
2336 message = "truncated \\xXX escape";
2337 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002338
Fredrik Lundhccc74732001-02-18 22:13:49 +00002339 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002340 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002341 digits = 4;
2342 message = "truncated \\uXXXX escape";
2343 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002344
Fredrik Lundhccc74732001-02-18 22:13:49 +00002345 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002346 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002347 digits = 8;
2348 message = "truncated \\UXXXXXXXX escape";
2349 hexescape:
2350 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002351 outpos = p-PyUnicode_AS_UNICODE(v);
2352 if (s+digits>end) {
2353 endinpos = size;
2354 if (unicode_decode_call_errorhandler(
2355 errors, &errorHandler,
2356 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002357 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002358 (PyObject **)&v, &outpos, &p))
2359 goto onError;
2360 goto nextByte;
2361 }
2362 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002363 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002364 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002365 endinpos = (s+i+1)-starts;
2366 if (unicode_decode_call_errorhandler(
2367 errors, &errorHandler,
2368 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002369 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002370 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002371 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002372 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002373 }
2374 chr = (chr<<4) & ~0xF;
2375 if (c >= '0' && c <= '9')
2376 chr += c - '0';
2377 else if (c >= 'a' && c <= 'f')
2378 chr += 10 + c - 'a';
2379 else
2380 chr += 10 + c - 'A';
2381 }
2382 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002383 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002384 /* _decoding_error will have already written into the
2385 target buffer. */
2386 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002387 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002388 /* when we get here, chr is a 32-bit unicode character */
2389 if (chr <= 0xffff)
2390 /* UCS-2 character */
2391 *p++ = (Py_UNICODE) chr;
2392 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002393 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002394 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002395#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002396 *p++ = chr;
2397#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002398 chr -= 0x10000L;
2399 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002400 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002401#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002402 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002403 endinpos = s-starts;
2404 outpos = p-PyUnicode_AS_UNICODE(v);
2405 if (unicode_decode_call_errorhandler(
2406 errors, &errorHandler,
2407 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002408 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002409 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002410 goto onError;
2411 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002412 break;
2413
2414 /* \N{name} */
2415 case 'N':
2416 message = "malformed \\N character escape";
2417 if (ucnhash_CAPI == NULL) {
2418 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002419 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002420 m = PyImport_ImportModule("unicodedata");
2421 if (m == NULL)
2422 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002423 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002424 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002425 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002426 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002427 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002428 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002429 if (ucnhash_CAPI == NULL)
2430 goto ucnhashError;
2431 }
2432 if (*s == '{') {
2433 const char *start = s+1;
2434 /* look for the closing brace */
2435 while (*s != '}' && s < end)
2436 s++;
2437 if (s > start && s < end && *s == '}') {
2438 /* found a name. look it up in the unicode database */
2439 message = "unknown Unicode character name";
2440 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002441 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002442 goto store;
2443 }
2444 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002445 endinpos = s-starts;
2446 outpos = p-PyUnicode_AS_UNICODE(v);
2447 if (unicode_decode_call_errorhandler(
2448 errors, &errorHandler,
2449 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002450 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002451 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002452 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002453 break;
2454
2455 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002456 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002457 message = "\\ at end of string";
2458 s--;
2459 endinpos = s-starts;
2460 outpos = p-PyUnicode_AS_UNICODE(v);
2461 if (unicode_decode_call_errorhandler(
2462 errors, &errorHandler,
2463 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002464 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002465 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002466 goto onError;
2467 }
2468 else {
2469 *p++ = '\\';
2470 *p++ = (unsigned char)s[-1];
2471 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002472 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002473 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002474 nextByte:
2475 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002476 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002477 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002478 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002479 Py_XDECREF(errorHandler);
2480 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002481 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002482
Fredrik Lundhccc74732001-02-18 22:13:49 +00002483ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002484 PyErr_SetString(
2485 PyExc_UnicodeError,
2486 "\\N escapes not supported (can't load unicodedata module)"
2487 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002488 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002489 Py_XDECREF(errorHandler);
2490 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002491 return NULL;
2492
Fredrik Lundhccc74732001-02-18 22:13:49 +00002493onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002494 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002495 Py_XDECREF(errorHandler);
2496 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002497 return NULL;
2498}
2499
/* Helpers and encoder for the Unicode-Escape codec.

   PyUnicode_EncodeUnicodeEscape() below returns the Unicode-Escape
   encoded version of a Py_UNICODE buffer.  It takes no "quotes"
   argument; the result is not wrapped in u"" or u'' quotes.

*/
2506
Thomas Wouters477c8d52006-05-27 19:21:47 +00002507Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2508 Py_ssize_t size,
2509 Py_UNICODE ch)
2510{
2511 /* like wcschr, but doesn't stop at NULL characters */
2512
2513 while (size-- > 0) {
2514 if (*s == ch)
2515 return s;
2516 s++;
2517 }
2518
2519 return NULL;
2520}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002521
/* Lower-case hexadecimal digits, indexed by nibble value (0..15).
   Used by the unicode-escape and raw-unicode-escape encoders to
   emit the digits of \x, \u and \U escapes. */
Walter Dörwald79e913e2007-05-12 11:08:06 +00002522static const char *hexdigits = "0123456789abcdef";
2523
2524PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2525 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002526{
2527 PyObject *repr;
2528 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002529
Thomas Wouters89f507f2006-12-13 04:49:30 +00002530 /* XXX(nnorwitz): rather than over-allocating, it would be
2531 better to choose a different scheme. Perhaps scan the
2532 first N-chars of the string and allocate based on that size.
2533 */
2534 /* Initial allocation is based on the longest-possible unichr
2535 escape.
2536
2537 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2538 unichr, so in this case it's the longest unichr escape. In
2539 narrow (UTF-16) builds this is five chars per source unichr
2540 since there are two unichrs in the surrogate pair, so in narrow
2541 (UTF-16) builds it's not the longest unichr escape.
2542
2543 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2544 so in the narrow (UTF-16) build case it's the longest unichr
2545 escape.
2546 */
2547
Walter Dörwald79e913e2007-05-12 11:08:06 +00002548 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002549#ifdef Py_UNICODE_WIDE
2550 + 10*size
2551#else
2552 + 6*size
2553#endif
2554 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002555 if (repr == NULL)
2556 return NULL;
2557
Walter Dörwald79e913e2007-05-12 11:08:06 +00002558 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559
Guido van Rossumd57fd912000-03-10 22:53:23 +00002560 while (size-- > 0) {
2561 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002562
Walter Dörwald79e913e2007-05-12 11:08:06 +00002563 /* Escape backslashes */
2564 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002565 *p++ = '\\';
2566 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002567 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002568 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002569
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002570#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002571 /* Map 21-bit characters to '\U00xxxxxx' */
2572 else if (ch >= 0x10000) {
2573 *p++ = '\\';
2574 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002575 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2576 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2577 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2578 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2579 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2580 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2581 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2582 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002583 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002584 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002585#else
2586 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002587 else if (ch >= 0xD800 && ch < 0xDC00) {
2588 Py_UNICODE ch2;
2589 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002590
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002591 ch2 = *s++;
2592 size--;
2593 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2594 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2595 *p++ = '\\';
2596 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002597 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2598 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2599 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2600 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2601 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2602 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2603 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2604 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002605 continue;
2606 }
2607 /* Fall through: isolated surrogates are copied as-is */
2608 s--;
2609 size++;
2610 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002611#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002612
Guido van Rossumd57fd912000-03-10 22:53:23 +00002613 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002614 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002615 *p++ = '\\';
2616 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002617 *p++ = hexdigits[(ch >> 12) & 0x000F];
2618 *p++ = hexdigits[(ch >> 8) & 0x000F];
2619 *p++ = hexdigits[(ch >> 4) & 0x000F];
2620 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002621 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002622
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002623 /* Map special whitespace to '\t', \n', '\r' */
2624 else if (ch == '\t') {
2625 *p++ = '\\';
2626 *p++ = 't';
2627 }
2628 else if (ch == '\n') {
2629 *p++ = '\\';
2630 *p++ = 'n';
2631 }
2632 else if (ch == '\r') {
2633 *p++ = '\\';
2634 *p++ = 'r';
2635 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002636
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002637 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002638 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002639 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002640 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002641 *p++ = hexdigits[(ch >> 4) & 0x000F];
2642 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002643 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002644
Guido van Rossumd57fd912000-03-10 22:53:23 +00002645 /* Copy everything else as-is */
2646 else
2647 *p++ = (char) ch;
2648 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002649
2650 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002651 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2652 Py_DECREF(repr);
2653 return NULL;
2654 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002655 return repr;
2656}
2657
Guido van Rossumd57fd912000-03-10 22:53:23 +00002658PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2659{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002660 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002661 if (!PyUnicode_Check(unicode)) {
2662 PyErr_BadArgument();
2663 return NULL;
2664 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002665 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2666 PyUnicode_GET_SIZE(unicode));
2667
2668 if (!s)
2669 return NULL;
2670 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2671 PyBytes_GET_SIZE(s));
2672 Py_DECREF(s);
2673 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002674}
2675
2676/* --- Raw Unicode Escape Codec ------------------------------------------- */
2677
2678PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002679 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002680 const char *errors)
2681{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002682 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002683 Py_ssize_t startinpos;
2684 Py_ssize_t endinpos;
2685 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002686 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002687 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002688 const char *end;
2689 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002690 PyObject *errorHandler = NULL;
2691 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002692
Guido van Rossumd57fd912000-03-10 22:53:23 +00002693 /* Escaped strings will always be longer than the resulting
2694 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002695 length after conversion to the true value. (But decoding error
2696 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002697 v = _PyUnicode_New(size);
2698 if (v == NULL)
2699 goto onError;
2700 if (size == 0)
2701 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002702 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002703 end = s + size;
2704 while (s < end) {
2705 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002706 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002707 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002708 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002709
2710 /* Non-escape characters are interpreted as Unicode ordinals */
2711 if (*s != '\\') {
2712 *p++ = (unsigned char)*s++;
2713 continue;
2714 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002715 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002716
2717 /* \u-escapes are only interpreted iff the number of leading
2718 backslashes if odd */
2719 bs = s;
2720 for (;s < end;) {
2721 if (*s != '\\')
2722 break;
2723 *p++ = (unsigned char)*s++;
2724 }
2725 if (((s - bs) & 1) == 0 ||
2726 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002727 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002728 continue;
2729 }
2730 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002731 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002732 s++;
2733
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002734 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002735 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002736 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002737 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002739 endinpos = s-starts;
2740 if (unicode_decode_call_errorhandler(
2741 errors, &errorHandler,
2742 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002743 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002744 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002745 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002746 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002747 }
2748 x = (x<<4) & ~0xF;
2749 if (c >= '0' && c <= '9')
2750 x += c - '0';
2751 else if (c >= 'a' && c <= 'f')
2752 x += 10 + c - 'a';
2753 else
2754 x += 10 + c - 'A';
2755 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002756#ifndef Py_UNICODE_WIDE
2757 if (x > 0x10000) {
2758 if (unicode_decode_call_errorhandler(
2759 errors, &errorHandler,
2760 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002761 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002762 (PyObject **)&v, &outpos, &p))
2763 goto onError;
2764 }
2765#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002766 *p++ = x;
2767 nextByte:
2768 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002769 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002770 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002771 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002772 Py_XDECREF(errorHandler);
2773 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002774 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002775
Guido van Rossumd57fd912000-03-10 22:53:23 +00002776 onError:
2777 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002778 Py_XDECREF(errorHandler);
2779 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780 return NULL;
2781}
2782
2783PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002784 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002785{
2786 PyObject *repr;
2787 char *p;
2788 char *q;
2789
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002790#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00002791 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002792#else
Walter Dörwald711005d2007-05-12 12:03:26 +00002793 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002794#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002795 if (repr == NULL)
2796 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002797 if (size == 0)
2798 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799
Walter Dörwald711005d2007-05-12 12:03:26 +00002800 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801 while (size-- > 0) {
2802 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002803#ifdef Py_UNICODE_WIDE
2804 /* Map 32-bit characters to '\Uxxxxxxxx' */
2805 if (ch >= 0x10000) {
2806 *p++ = '\\';
2807 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002808 *p++ = hexdigits[(ch >> 28) & 0xf];
2809 *p++ = hexdigits[(ch >> 24) & 0xf];
2810 *p++ = hexdigits[(ch >> 20) & 0xf];
2811 *p++ = hexdigits[(ch >> 16) & 0xf];
2812 *p++ = hexdigits[(ch >> 12) & 0xf];
2813 *p++ = hexdigits[(ch >> 8) & 0xf];
2814 *p++ = hexdigits[(ch >> 4) & 0xf];
2815 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002816 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002817 else
2818#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002819 /* Map 16-bit characters to '\uxxxx' */
2820 if (ch >= 256) {
2821 *p++ = '\\';
2822 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002823 *p++ = hexdigits[(ch >> 12) & 0xf];
2824 *p++ = hexdigits[(ch >> 8) & 0xf];
2825 *p++ = hexdigits[(ch >> 4) & 0xf];
2826 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002827 }
2828 /* Copy everything else as-is */
2829 else
2830 *p++ = (char) ch;
2831 }
2832 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00002833 if (PyBytes_Resize(repr, p - q)) {
2834 Py_DECREF(repr);
2835 return NULL;
2836 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837 return repr;
2838}
2839
2840PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2841{
Walter Dörwald711005d2007-05-12 12:03:26 +00002842 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00002844 PyErr_BadArgument();
2845 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002846 }
Walter Dörwald711005d2007-05-12 12:03:26 +00002847 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2848 PyUnicode_GET_SIZE(unicode));
2849
2850 if (!s)
2851 return NULL;
2852 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2853 PyBytes_GET_SIZE(s));
2854 Py_DECREF(s);
2855 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002856}
2857
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002858/* --- Unicode Internal Codec ------------------------------------------- */
2859
2860PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002861 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002862 const char *errors)
2863{
2864 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002865 Py_ssize_t startinpos;
2866 Py_ssize_t endinpos;
2867 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002868 PyUnicodeObject *v;
2869 Py_UNICODE *p;
2870 const char *end;
2871 const char *reason;
2872 PyObject *errorHandler = NULL;
2873 PyObject *exc = NULL;
2874
Neal Norwitzd43069c2006-01-08 01:12:10 +00002875#ifdef Py_UNICODE_WIDE
2876 Py_UNICODE unimax = PyUnicode_GetMax();
2877#endif
2878
Thomas Wouters89f507f2006-12-13 04:49:30 +00002879 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002880 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2881 if (v == NULL)
2882 goto onError;
2883 if (PyUnicode_GetSize((PyObject *)v) == 0)
2884 return (PyObject *)v;
2885 p = PyUnicode_AS_UNICODE(v);
2886 end = s + size;
2887
2888 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002889 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002890 /* We have to sanity check the raw data, otherwise doom looms for
2891 some malformed UCS-4 data. */
2892 if (
2893 #ifdef Py_UNICODE_WIDE
2894 *p > unimax || *p < 0 ||
2895 #endif
2896 end-s < Py_UNICODE_SIZE
2897 )
2898 {
2899 startinpos = s - starts;
2900 if (end-s < Py_UNICODE_SIZE) {
2901 endinpos = end-starts;
2902 reason = "truncated input";
2903 }
2904 else {
2905 endinpos = s - starts + Py_UNICODE_SIZE;
2906 reason = "illegal code point (> 0x10FFFF)";
2907 }
2908 outpos = p - PyUnicode_AS_UNICODE(v);
2909 if (unicode_decode_call_errorhandler(
2910 errors, &errorHandler,
2911 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002912 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002913 (PyObject **)&v, &outpos, &p)) {
2914 goto onError;
2915 }
2916 }
2917 else {
2918 p++;
2919 s += Py_UNICODE_SIZE;
2920 }
2921 }
2922
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002923 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002924 goto onError;
2925 Py_XDECREF(errorHandler);
2926 Py_XDECREF(exc);
2927 return (PyObject *)v;
2928
2929 onError:
2930 Py_XDECREF(v);
2931 Py_XDECREF(errorHandler);
2932 Py_XDECREF(exc);
2933 return NULL;
2934}
2935
Guido van Rossumd57fd912000-03-10 22:53:23 +00002936/* --- Latin-1 Codec ------------------------------------------------------ */
2937
2938PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002939 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002940 const char *errors)
2941{
2942 PyUnicodeObject *v;
2943 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002944
Guido van Rossumd57fd912000-03-10 22:53:23 +00002945 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002946 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002947 Py_UNICODE r = *(unsigned char*)s;
2948 return PyUnicode_FromUnicode(&r, 1);
2949 }
2950
Guido van Rossumd57fd912000-03-10 22:53:23 +00002951 v = _PyUnicode_New(size);
2952 if (v == NULL)
2953 goto onError;
2954 if (size == 0)
2955 return (PyObject *)v;
2956 p = PyUnicode_AS_UNICODE(v);
2957 while (size-- > 0)
2958 *p++ = (unsigned char)*s++;
2959 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002960
Guido van Rossumd57fd912000-03-10 22:53:23 +00002961 onError:
2962 Py_XDECREF(v);
2963 return NULL;
2964}
2965
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002966/* create or adjust a UnicodeEncodeError */
2967static void make_encode_exception(PyObject **exceptionObject,
2968 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002969 const Py_UNICODE *unicode, Py_ssize_t size,
2970 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002971 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002972{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002973 if (*exceptionObject == NULL) {
2974 *exceptionObject = PyUnicodeEncodeError_Create(
2975 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002976 }
2977 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002978 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2979 goto onError;
2980 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2981 goto onError;
2982 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2983 goto onError;
2984 return;
2985 onError:
2986 Py_DECREF(*exceptionObject);
2987 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988 }
2989}
2990
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002991/* raises a UnicodeEncodeError */
2992static void raise_encode_exception(PyObject **exceptionObject,
2993 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002994 const Py_UNICODE *unicode, Py_ssize_t size,
2995 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002996 const char *reason)
2997{
2998 make_encode_exception(exceptionObject,
2999 encoding, unicode, size, startpos, endpos, reason);
3000 if (*exceptionObject != NULL)
3001 PyCodec_StrictErrors(*exceptionObject);
3002}
3003
3004/* error handling callback helper:
3005 build arguments, call the callback and check the arguments,
3006 put the result into newpos and return the replacement string, which
3007 has to be freed by the caller */
3008static PyObject *unicode_encode_call_errorhandler(const char *errors,
3009 PyObject **errorHandler,
3010 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003011 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3012 Py_ssize_t startpos, Py_ssize_t endpos,
3013 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003014{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003015 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003016
3017 PyObject *restuple;
3018 PyObject *resunicode;
3019
3020 if (*errorHandler == NULL) {
3021 *errorHandler = PyCodec_LookupError(errors);
3022 if (*errorHandler == NULL)
3023 return NULL;
3024 }
3025
3026 make_encode_exception(exceptionObject,
3027 encoding, unicode, size, startpos, endpos, reason);
3028 if (*exceptionObject == NULL)
3029 return NULL;
3030
3031 restuple = PyObject_CallFunctionObjArgs(
3032 *errorHandler, *exceptionObject, NULL);
3033 if (restuple == NULL)
3034 return NULL;
3035 if (!PyTuple_Check(restuple)) {
3036 PyErr_Format(PyExc_TypeError, &argparse[4]);
3037 Py_DECREF(restuple);
3038 return NULL;
3039 }
3040 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3041 &resunicode, newpos)) {
3042 Py_DECREF(restuple);
3043 return NULL;
3044 }
3045 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003046 *newpos = size+*newpos;
3047 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003048 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003049 Py_DECREF(restuple);
3050 return NULL;
3051 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003052 Py_INCREF(resunicode);
3053 Py_DECREF(restuple);
3054 return resunicode;
3055}
3056
3057static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003058 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003059 const char *errors,
3060 int limit)
3061{
3062 /* output object */
3063 PyObject *res;
3064 /* pointers to the beginning and end+1 of input */
3065 const Py_UNICODE *startp = p;
3066 const Py_UNICODE *endp = p + size;
3067 /* pointer to the beginning of the unencodable characters */
3068 /* const Py_UNICODE *badp = NULL; */
3069 /* pointer into the output */
3070 char *str;
3071 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003072 Py_ssize_t respos = 0;
3073 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003074 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3075 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003076 PyObject *errorHandler = NULL;
3077 PyObject *exc = NULL;
3078 /* the following variable is used for caching string comparisons
3079 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3080 int known_errorHandler = -1;
3081
3082 /* allocate enough for a simple encoding without
3083 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003084 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003085 if (res == NULL)
3086 goto onError;
3087 if (size == 0)
3088 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003089 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003090 ressize = size;
3091
3092 while (p<endp) {
3093 Py_UNICODE c = *p;
3094
3095 /* can we encode this? */
3096 if (c<limit) {
3097 /* no overflow check, because we know that the space is enough */
3098 *str++ = (char)c;
3099 ++p;
3100 }
3101 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003102 Py_ssize_t unicodepos = p-startp;
3103 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003104 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003105 Py_ssize_t repsize;
3106 Py_ssize_t newpos;
3107 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003108 Py_UNICODE *uni2;
3109 /* startpos for collecting unencodable chars */
3110 const Py_UNICODE *collstart = p;
3111 const Py_UNICODE *collend = p;
3112 /* find all unecodable characters */
3113 while ((collend < endp) && ((*collend)>=limit))
3114 ++collend;
3115 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3116 if (known_errorHandler==-1) {
3117 if ((errors==NULL) || (!strcmp(errors, "strict")))
3118 known_errorHandler = 1;
3119 else if (!strcmp(errors, "replace"))
3120 known_errorHandler = 2;
3121 else if (!strcmp(errors, "ignore"))
3122 known_errorHandler = 3;
3123 else if (!strcmp(errors, "xmlcharrefreplace"))
3124 known_errorHandler = 4;
3125 else
3126 known_errorHandler = 0;
3127 }
3128 switch (known_errorHandler) {
3129 case 1: /* strict */
3130 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3131 goto onError;
3132 case 2: /* replace */
3133 while (collstart++<collend)
3134 *str++ = '?'; /* fall through */
3135 case 3: /* ignore */
3136 p = collend;
3137 break;
3138 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003139 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003140 /* determine replacement size (temporarily (mis)uses p) */
3141 for (p = collstart, repsize = 0; p < collend; ++p) {
3142 if (*p<10)
3143 repsize += 2+1+1;
3144 else if (*p<100)
3145 repsize += 2+2+1;
3146 else if (*p<1000)
3147 repsize += 2+3+1;
3148 else if (*p<10000)
3149 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003150#ifndef Py_UNICODE_WIDE
3151 else
3152 repsize += 2+5+1;
3153#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003154 else if (*p<100000)
3155 repsize += 2+5+1;
3156 else if (*p<1000000)
3157 repsize += 2+6+1;
3158 else
3159 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003160#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003161 }
3162 requiredsize = respos+repsize+(endp-collend);
3163 if (requiredsize > ressize) {
3164 if (requiredsize<2*ressize)
3165 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003166 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003167 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003168 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003169 ressize = requiredsize;
3170 }
3171 /* generate replacement (temporarily (mis)uses p) */
3172 for (p = collstart; p < collend; ++p) {
3173 str += sprintf(str, "&#%d;", (int)*p);
3174 }
3175 p = collend;
3176 break;
3177 default:
3178 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3179 encoding, reason, startp, size, &exc,
3180 collstart-startp, collend-startp, &newpos);
3181 if (repunicode == NULL)
3182 goto onError;
3183 /* need more space? (at least enough for what we
3184 have+the replacement+the rest of the string, so
3185 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003186 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003187 repsize = PyUnicode_GET_SIZE(repunicode);
3188 requiredsize = respos+repsize+(endp-collend);
3189 if (requiredsize > ressize) {
3190 if (requiredsize<2*ressize)
3191 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003192 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003193 Py_DECREF(repunicode);
3194 goto onError;
3195 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003196 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003197 ressize = requiredsize;
3198 }
3199 /* check if there is anything unencodable in the replacement
3200 and copy it to the output */
3201 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3202 c = *uni2;
3203 if (c >= limit) {
3204 raise_encode_exception(&exc, encoding, startp, size,
3205 unicodepos, unicodepos+1, reason);
3206 Py_DECREF(repunicode);
3207 goto onError;
3208 }
3209 *str = (char)c;
3210 }
3211 p = startp + newpos;
3212 Py_DECREF(repunicode);
3213 }
3214 }
3215 }
3216 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003217 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003218 if (respos<ressize)
3219 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003220 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003221 Py_XDECREF(errorHandler);
3222 Py_XDECREF(exc);
3223 return res;
3224
3225 onError:
3226 Py_XDECREF(res);
3227 Py_XDECREF(errorHandler);
3228 Py_XDECREF(exc);
3229 return NULL;
3230}
3231
Guido van Rossumd57fd912000-03-10 22:53:23 +00003232PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003233 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003234 const char *errors)
3235{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003236 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237}
3238
3239PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3240{
3241 if (!PyUnicode_Check(unicode)) {
3242 PyErr_BadArgument();
3243 return NULL;
3244 }
3245 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3246 PyUnicode_GET_SIZE(unicode),
3247 NULL);
3248}
3249
3250/* --- 7-bit ASCII Codec -------------------------------------------------- */
3251
Guido van Rossumd57fd912000-03-10 22:53:23 +00003252PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003253 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003254 const char *errors)
3255{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003256 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003257 PyUnicodeObject *v;
3258 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003259 Py_ssize_t startinpos;
3260 Py_ssize_t endinpos;
3261 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003262 const char *e;
3263 PyObject *errorHandler = NULL;
3264 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003265
Guido van Rossumd57fd912000-03-10 22:53:23 +00003266 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003267 if (size == 1 && *(unsigned char*)s < 128) {
3268 Py_UNICODE r = *(unsigned char*)s;
3269 return PyUnicode_FromUnicode(&r, 1);
3270 }
Tim Petersced69f82003-09-16 20:30:58 +00003271
Guido van Rossumd57fd912000-03-10 22:53:23 +00003272 v = _PyUnicode_New(size);
3273 if (v == NULL)
3274 goto onError;
3275 if (size == 0)
3276 return (PyObject *)v;
3277 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003278 e = s + size;
3279 while (s < e) {
3280 register unsigned char c = (unsigned char)*s;
3281 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003282 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003283 ++s;
3284 }
3285 else {
3286 startinpos = s-starts;
3287 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003288 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003289 if (unicode_decode_call_errorhandler(
3290 errors, &errorHandler,
3291 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003292 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003293 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003294 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003295 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003296 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003297 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003298 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003299 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003300 Py_XDECREF(errorHandler);
3301 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003302 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003303
Guido van Rossumd57fd912000-03-10 22:53:23 +00003304 onError:
3305 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003306 Py_XDECREF(errorHandler);
3307 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003308 return NULL;
3309}
3310
Guido van Rossumd57fd912000-03-10 22:53:23 +00003311PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003312 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003313 const char *errors)
3314{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003315 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003316}
3317
3318PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3319{
3320 if (!PyUnicode_Check(unicode)) {
3321 PyErr_BadArgument();
3322 return NULL;
3323 }
3324 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3325 PyUnicode_GET_SIZE(unicode),
3326 NULL);
3327}
3328
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003329#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003330
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003331/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003332
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003333#if SIZEOF_INT < SIZEOF_SSIZE_T
3334#define NEED_RETRY
3335#endif
3336
3337/* XXX This code is limited to "true" double-byte encodings, as
3338 a) it assumes an incomplete character consists of a single byte, and
3339 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3340 encodings, see IsDBCSLeadByteEx documentation. */
3341
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.  The byte must satisfy IsDBCSLeadByte(); in addition the
   character in front of it, as located by CharPrev(), must either not
   exist (CharPrev() returns the same position), not start with a lead
   byte, or be exactly two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3352
3353/*
3354 * Decode MBCS string into unicode object. If 'final' is set, converts
3355 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3356 */
3357static int decode_mbcs(PyUnicodeObject **v,
3358 const char *s, /* MBCS string */
3359 int size, /* sizeof MBCS string */
3360 int final)
3361{
3362 Py_UNICODE *p;
3363 Py_ssize_t n = 0;
3364 int usize = 0;
3365
3366 assert(size >= 0);
3367
3368 /* Skip trailing lead-byte unless 'final' is set */
3369 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3370 --size;
3371
3372 /* First get the size of the result */
3373 if (size > 0) {
3374 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3375 if (usize == 0) {
3376 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3377 return -1;
3378 }
3379 }
3380
3381 if (*v == NULL) {
3382 /* Create unicode object */
3383 *v = _PyUnicode_New(usize);
3384 if (*v == NULL)
3385 return -1;
3386 }
3387 else {
3388 /* Extend unicode object */
3389 n = PyUnicode_GET_SIZE(*v);
3390 if (_PyUnicode_Resize(v, n + usize) < 0)
3391 return -1;
3392 }
3393
3394 /* Do the conversion */
3395 if (size > 0) {
3396 p = PyUnicode_AS_UNICODE(*v) + n;
3397 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3398 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3399 return -1;
3400 }
3401 }
3402
3403 return size;
3404}
3405
3406PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3407 Py_ssize_t size,
3408 const char *errors,
3409 Py_ssize_t *consumed)
3410{
3411 PyUnicodeObject *v = NULL;
3412 int done;
3413
3414 if (consumed)
3415 *consumed = 0;
3416
3417#ifdef NEED_RETRY
3418 retry:
3419 if (size > INT_MAX)
3420 done = decode_mbcs(&v, s, INT_MAX, 0);
3421 else
3422#endif
3423 done = decode_mbcs(&v, s, (int)size, !consumed);
3424
3425 if (done < 0) {
3426 Py_XDECREF(v);
3427 return NULL;
3428 }
3429
3430 if (consumed)
3431 *consumed += done;
3432
3433#ifdef NEED_RETRY
3434 if (size > INT_MAX) {
3435 s += done;
3436 size -= done;
3437 goto retry;
3438 }
3439#endif
3440
3441 return (PyObject *)v;
3442}
3443
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003444PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003445 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003446 const char *errors)
3447{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003448 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3449}
3450
3451/*
3452 * Convert unicode into string object (MBCS).
3453 * Returns 0 if succeed, -1 otherwise.
3454 */
3455static int encode_mbcs(PyObject **repr,
3456 const Py_UNICODE *p, /* unicode */
3457 int size) /* size of unicode */
3458{
3459 int mbcssize = 0;
3460 Py_ssize_t n = 0;
3461
3462 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003463
3464 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003465 if (size > 0) {
3466 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3467 if (mbcssize == 0) {
3468 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3469 return -1;
3470 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003471 }
3472
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003473 if (*repr == NULL) {
3474 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003475 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003476 if (*repr == NULL)
3477 return -1;
3478 }
3479 else {
3480 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003481 n = PyBytes_Size(*repr);
3482 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003483 return -1;
3484 }
3485
3486 /* Do the conversion */
3487 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003488 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003489 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3490 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3491 return -1;
3492 }
3493 }
3494
3495 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003496}
3497
3498PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003499 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003500 const char *errors)
3501{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003502 PyObject *repr = NULL;
3503 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003504
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003505#ifdef NEED_RETRY
3506 retry:
3507 if (size > INT_MAX)
3508 ret = encode_mbcs(&repr, p, INT_MAX);
3509 else
3510#endif
3511 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003512
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003513 if (ret < 0) {
3514 Py_XDECREF(repr);
3515 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003516 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003517
3518#ifdef NEED_RETRY
3519 if (size > INT_MAX) {
3520 p += INT_MAX;
3521 size -= INT_MAX;
3522 goto retry;
3523 }
3524#endif
3525
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003526 return repr;
3527}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003528
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003529PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3530{
3531 if (!PyUnicode_Check(unicode)) {
3532 PyErr_BadArgument();
3533 return NULL;
3534 }
3535 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3536 PyUnicode_GET_SIZE(unicode),
3537 NULL);
3538}
3539
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003540#undef NEED_RETRY
3541
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003542#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003543
Guido van Rossumd57fd912000-03-10 22:53:23 +00003544/* --- Character Mapping Codec -------------------------------------------- */
3545
Guido van Rossumd57fd912000-03-10 22:53:23 +00003546PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003547 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003548 PyObject *mapping,
3549 const char *errors)
3550{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003551 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003552 Py_ssize_t startinpos;
3553 Py_ssize_t endinpos;
3554 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003555 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003556 PyUnicodeObject *v;
3557 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003558 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003559 PyObject *errorHandler = NULL;
3560 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003561 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003562 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003563
Guido van Rossumd57fd912000-03-10 22:53:23 +00003564 /* Default to Latin-1 */
3565 if (mapping == NULL)
3566 return PyUnicode_DecodeLatin1(s, size, errors);
3567
3568 v = _PyUnicode_New(size);
3569 if (v == NULL)
3570 goto onError;
3571 if (size == 0)
3572 return (PyObject *)v;
3573 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003574 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003575 if (PyUnicode_CheckExact(mapping)) {
3576 mapstring = PyUnicode_AS_UNICODE(mapping);
3577 maplen = PyUnicode_GET_SIZE(mapping);
3578 while (s < e) {
3579 unsigned char ch = *s;
3580 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003581
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003582 if (ch < maplen)
3583 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003584
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003585 if (x == 0xfffe) {
3586 /* undefined mapping */
3587 outpos = p-PyUnicode_AS_UNICODE(v);
3588 startinpos = s-starts;
3589 endinpos = startinpos+1;
3590 if (unicode_decode_call_errorhandler(
3591 errors, &errorHandler,
3592 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003593 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003594 (PyObject **)&v, &outpos, &p)) {
3595 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003596 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003597 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003598 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003599 *p++ = x;
3600 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003601 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003602 }
3603 else {
3604 while (s < e) {
3605 unsigned char ch = *s;
3606 PyObject *w, *x;
3607
3608 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3609 w = PyInt_FromLong((long)ch);
3610 if (w == NULL)
3611 goto onError;
3612 x = PyObject_GetItem(mapping, w);
3613 Py_DECREF(w);
3614 if (x == NULL) {
3615 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3616 /* No mapping found means: mapping is undefined. */
3617 PyErr_Clear();
3618 x = Py_None;
3619 Py_INCREF(x);
3620 } else
3621 goto onError;
3622 }
3623
3624 /* Apply mapping */
3625 if (PyInt_Check(x)) {
3626 long value = PyInt_AS_LONG(x);
3627 if (value < 0 || value > 65535) {
3628 PyErr_SetString(PyExc_TypeError,
3629 "character mapping must be in range(65536)");
3630 Py_DECREF(x);
3631 goto onError;
3632 }
3633 *p++ = (Py_UNICODE)value;
3634 }
3635 else if (x == Py_None) {
3636 /* undefined mapping */
3637 outpos = p-PyUnicode_AS_UNICODE(v);
3638 startinpos = s-starts;
3639 endinpos = startinpos+1;
3640 if (unicode_decode_call_errorhandler(
3641 errors, &errorHandler,
3642 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003643 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003644 (PyObject **)&v, &outpos, &p)) {
3645 Py_DECREF(x);
3646 goto onError;
3647 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003648 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003649 continue;
3650 }
3651 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003652 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003653
3654 if (targetsize == 1)
3655 /* 1-1 mapping */
3656 *p++ = *PyUnicode_AS_UNICODE(x);
3657
3658 else if (targetsize > 1) {
3659 /* 1-n mapping */
3660 if (targetsize > extrachars) {
3661 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003662 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3663 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003664 (targetsize << 2);
3665 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003666 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003667 if (_PyUnicode_Resize(&v,
3668 PyUnicode_GET_SIZE(v) + needed) < 0) {
3669 Py_DECREF(x);
3670 goto onError;
3671 }
3672 p = PyUnicode_AS_UNICODE(v) + oldpos;
3673 }
3674 Py_UNICODE_COPY(p,
3675 PyUnicode_AS_UNICODE(x),
3676 targetsize);
3677 p += targetsize;
3678 extrachars -= targetsize;
3679 }
3680 /* 1-0 mapping: skip the character */
3681 }
3682 else {
3683 /* wrong return value */
3684 PyErr_SetString(PyExc_TypeError,
3685 "character mapping must return integer, None or unicode");
3686 Py_DECREF(x);
3687 goto onError;
3688 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003689 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003690 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003691 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003692 }
3693 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003694 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003695 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003696 Py_XDECREF(errorHandler);
3697 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003698 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003699
Guido van Rossumd57fd912000-03-10 22:53:23 +00003700 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003701 Py_XDECREF(errorHandler);
3702 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003703 Py_XDECREF(v);
3704 return NULL;
3705}
3706
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003707/* Charmap encoding: the lookup table */
3708
3709struct encoding_map{
3710 PyObject_HEAD
3711 unsigned char level1[32];
3712 int count2, count3;
3713 unsigned char level23[1];
3714};
3715
3716static PyObject*
3717encoding_map_size(PyObject *obj, PyObject* args)
3718{
3719 struct encoding_map *map = (struct encoding_map*)obj;
3720 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3721 128*map->count3);
3722}
3723
3724static PyMethodDef encoding_map_methods[] = {
3725 {"size", encoding_map_size, METH_NOARGS,
3726 PyDoc_STR("Return the size (in bytes) of this object") },
3727 { 0 }
3728};
3729
3730static void
3731encoding_map_dealloc(PyObject* o)
3732{
3733 PyObject_FREE(o);
3734}
3735
3736static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003737 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003738 "EncodingMap", /*tp_name*/
3739 sizeof(struct encoding_map), /*tp_basicsize*/
3740 0, /*tp_itemsize*/
3741 /* methods */
3742 encoding_map_dealloc, /*tp_dealloc*/
3743 0, /*tp_print*/
3744 0, /*tp_getattr*/
3745 0, /*tp_setattr*/
3746 0, /*tp_compare*/
3747 0, /*tp_repr*/
3748 0, /*tp_as_number*/
3749 0, /*tp_as_sequence*/
3750 0, /*tp_as_mapping*/
3751 0, /*tp_hash*/
3752 0, /*tp_call*/
3753 0, /*tp_str*/
3754 0, /*tp_getattro*/
3755 0, /*tp_setattro*/
3756 0, /*tp_as_buffer*/
3757 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3758 0, /*tp_doc*/
3759 0, /*tp_traverse*/
3760 0, /*tp_clear*/
3761 0, /*tp_richcompare*/
3762 0, /*tp_weaklistoffset*/
3763 0, /*tp_iter*/
3764 0, /*tp_iternext*/
3765 encoding_map_methods, /*tp_methods*/
3766 0, /*tp_members*/
3767 0, /*tp_getset*/
3768 0, /*tp_base*/
3769 0, /*tp_dict*/
3770 0, /*tp_descr_get*/
3771 0, /*tp_descr_set*/
3772 0, /*tp_dictoffset*/
3773 0, /*tp_init*/
3774 0, /*tp_alloc*/
3775 0, /*tp_new*/
3776 0, /*tp_free*/
3777 0, /*tp_is_gc*/
3778};
3779
3780PyObject*
3781PyUnicode_BuildEncodingMap(PyObject* string)
3782{
3783 Py_UNICODE *decode;
3784 PyObject *result;
3785 struct encoding_map *mresult;
3786 int i;
3787 int need_dict = 0;
3788 unsigned char level1[32];
3789 unsigned char level2[512];
3790 unsigned char *mlevel1, *mlevel2, *mlevel3;
3791 int count2 = 0, count3 = 0;
3792
3793 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3794 PyErr_BadArgument();
3795 return NULL;
3796 }
3797 decode = PyUnicode_AS_UNICODE(string);
3798 memset(level1, 0xFF, sizeof level1);
3799 memset(level2, 0xFF, sizeof level2);
3800
3801 /* If there isn't a one-to-one mapping of NULL to \0,
3802 or if there are non-BMP characters, we need to use
3803 a mapping dictionary. */
3804 if (decode[0] != 0)
3805 need_dict = 1;
3806 for (i = 1; i < 256; i++) {
3807 int l1, l2;
3808 if (decode[i] == 0
3809 #ifdef Py_UNICODE_WIDE
3810 || decode[i] > 0xFFFF
3811 #endif
3812 ) {
3813 need_dict = 1;
3814 break;
3815 }
3816 if (decode[i] == 0xFFFE)
3817 /* unmapped character */
3818 continue;
3819 l1 = decode[i] >> 11;
3820 l2 = decode[i] >> 7;
3821 if (level1[l1] == 0xFF)
3822 level1[l1] = count2++;
3823 if (level2[l2] == 0xFF)
3824 level2[l2] = count3++;
3825 }
3826
3827 if (count2 >= 0xFF || count3 >= 0xFF)
3828 need_dict = 1;
3829
3830 if (need_dict) {
3831 PyObject *result = PyDict_New();
3832 PyObject *key, *value;
3833 if (!result)
3834 return NULL;
3835 for (i = 0; i < 256; i++) {
3836 key = value = NULL;
3837 key = PyInt_FromLong(decode[i]);
3838 value = PyInt_FromLong(i);
3839 if (!key || !value)
3840 goto failed1;
3841 if (PyDict_SetItem(result, key, value) == -1)
3842 goto failed1;
3843 Py_DECREF(key);
3844 Py_DECREF(value);
3845 }
3846 return result;
3847 failed1:
3848 Py_XDECREF(key);
3849 Py_XDECREF(value);
3850 Py_DECREF(result);
3851 return NULL;
3852 }
3853
3854 /* Create a three-level trie */
3855 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3856 16*count2 + 128*count3 - 1);
3857 if (!result)
3858 return PyErr_NoMemory();
3859 PyObject_Init(result, &EncodingMapType);
3860 mresult = (struct encoding_map*)result;
3861 mresult->count2 = count2;
3862 mresult->count3 = count3;
3863 mlevel1 = mresult->level1;
3864 mlevel2 = mresult->level23;
3865 mlevel3 = mresult->level23 + 16*count2;
3866 memcpy(mlevel1, level1, 32);
3867 memset(mlevel2, 0xFF, 16*count2);
3868 memset(mlevel3, 0, 128*count3);
3869 count3 = 0;
3870 for (i = 1; i < 256; i++) {
3871 int o1, o2, o3, i2, i3;
3872 if (decode[i] == 0xFFFE)
3873 /* unmapped character */
3874 continue;
3875 o1 = decode[i]>>11;
3876 o2 = (decode[i]>>7) & 0xF;
3877 i2 = 16*mlevel1[o1] + o2;
3878 if (mlevel2[i2] == 0xFF)
3879 mlevel2[i2] = count3++;
3880 o3 = decode[i] & 0x7F;
3881 i3 = 128*mlevel2[i2] + o3;
3882 mlevel3[i3] = i;
3883 }
3884 return result;
3885}
3886
3887static int
3888encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3889{
3890 struct encoding_map *map = (struct encoding_map*)mapping;
3891 int l1 = c>>11;
3892 int l2 = (c>>7) & 0xF;
3893 int l3 = c & 0x7F;
3894 int i;
3895
3896#ifdef Py_UNICODE_WIDE
3897 if (c > 0xFFFF) {
3898 return -1;
3899 }
3900#endif
3901 if (c == 0)
3902 return 0;
3903 /* level 1*/
3904 i = map->level1[l1];
3905 if (i == 0xFF) {
3906 return -1;
3907 }
3908 /* level 2*/
3909 i = map->level23[16*i+l2];
3910 if (i == 0xFF) {
3911 return -1;
3912 }
3913 /* level 3 */
3914 i = map->level23[16*map->count2 + 128*i + l3];
3915 if (i == 0) {
3916 return -1;
3917 }
3918 return i;
3919}
3920
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003921/* Lookup the character ch in the mapping. If the character
3922 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003923 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003924static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003925{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003926 PyObject *w = PyInt_FromLong((long)c);
3927 PyObject *x;
3928
3929 if (w == NULL)
3930 return NULL;
3931 x = PyObject_GetItem(mapping, w);
3932 Py_DECREF(w);
3933 if (x == NULL) {
3934 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3935 /* No mapping found means: mapping is undefined. */
3936 PyErr_Clear();
3937 x = Py_None;
3938 Py_INCREF(x);
3939 return x;
3940 } else
3941 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003942 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003943 else if (x == Py_None)
3944 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003945 else if (PyInt_Check(x)) {
3946 long value = PyInt_AS_LONG(x);
3947 if (value < 0 || value > 255) {
3948 PyErr_SetString(PyExc_TypeError,
3949 "character mapping must be in range(256)");
3950 Py_DECREF(x);
3951 return NULL;
3952 }
3953 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003954 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003955 else if (PyString_Check(x))
3956 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003957 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003958 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00003959 PyErr_Format(PyExc_TypeError,
3960 "character mapping must return integer, None or str8, not %.400s",
3961 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003962 Py_DECREF(x);
3963 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003964 }
3965}
3966
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003967static int
Walter Dörwald827b0552007-05-12 13:23:53 +00003968charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003969{
Walter Dörwald827b0552007-05-12 13:23:53 +00003970 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003971 /* exponentially overallocate to minimize reallocations */
3972 if (requiredsize < 2*outsize)
3973 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00003974 if (PyBytes_Resize(outobj, requiredsize)) {
3975 Py_DECREF(outobj);
3976 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003977 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003978 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003979}
3980
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS = 0,    /* the character was written to the output */
    enc_FAILED = 1,     /* the mapping has no entry for the character */
    enc_EXCEPTION = 2   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003984/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00003985 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003986 space is available. Return a new reference to the object that
3987 was put in the output buffer, or Py_None, if the mapping was undefined
3988 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003989 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003990static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003991charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00003992 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003993{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003994 PyObject *rep;
3995 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00003996 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003997
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003998 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003999 int res = encoding_map_lookup(c, mapping);
4000 Py_ssize_t requiredsize = *outpos+1;
4001 if (res == -1)
4002 return enc_FAILED;
4003 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004004 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004005 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00004006 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004007 outstart[(*outpos)++] = (char)res;
4008 return enc_SUCCESS;
4009 }
4010
4011 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004012 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004013 return enc_EXCEPTION;
4014 else if (rep==Py_None) {
4015 Py_DECREF(rep);
4016 return enc_FAILED;
4017 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004018 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004019 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004020 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004021 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004022 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004023 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004024 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004025 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004026 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4027 }
4028 else {
4029 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004030 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4031 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004032 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004033 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004034 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004035 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004036 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004037 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004038 memcpy(outstart + *outpos, repchars, repsize);
4039 *outpos += repsize;
4040 }
4041 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004042 Py_DECREF(rep);
4043 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004044}
4045
4046/* handle an error in PyUnicode_EncodeCharmap
4047 Return 0 on success, -1 on error */
4048static
4049int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004050 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004051 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004052 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004053 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004054{
4055 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004056 Py_ssize_t repsize;
4057 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004058 Py_UNICODE *uni2;
4059 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004060 Py_ssize_t collstartpos = *inpos;
4061 Py_ssize_t collendpos = *inpos+1;
4062 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004063 char *encoding = "charmap";
4064 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004065 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004066
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004067 /* find all unencodable characters */
4068 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004069 PyObject *rep;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004070 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004071 int res = encoding_map_lookup(p[collendpos], mapping);
4072 if (res != -1)
4073 break;
4074 ++collendpos;
4075 continue;
4076 }
4077
4078 rep = charmapencode_lookup(p[collendpos], mapping);
4079 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004080 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004081 else if (rep!=Py_None) {
4082 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004083 break;
4084 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004085 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004086 ++collendpos;
4087 }
4088 /* cache callback name lookup
4089 * (if not done yet, i.e. it's the first error) */
4090 if (*known_errorHandler==-1) {
4091 if ((errors==NULL) || (!strcmp(errors, "strict")))
4092 *known_errorHandler = 1;
4093 else if (!strcmp(errors, "replace"))
4094 *known_errorHandler = 2;
4095 else if (!strcmp(errors, "ignore"))
4096 *known_errorHandler = 3;
4097 else if (!strcmp(errors, "xmlcharrefreplace"))
4098 *known_errorHandler = 4;
4099 else
4100 *known_errorHandler = 0;
4101 }
4102 switch (*known_errorHandler) {
4103 case 1: /* strict */
4104 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4105 return -1;
4106 case 2: /* replace */
4107 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4108 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004109 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004110 return -1;
4111 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004112 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004113 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4114 return -1;
4115 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004116 }
4117 /* fall through */
4118 case 3: /* ignore */
4119 *inpos = collendpos;
4120 break;
4121 case 4: /* xmlcharrefreplace */
4122 /* generate replacement (temporarily (mis)uses p) */
4123 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4124 char buffer[2+29+1+1];
4125 char *cp;
4126 sprintf(buffer, "&#%d;", (int)p[collpos]);
4127 for (cp = buffer; *cp; ++cp) {
4128 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004129 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004130 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004131 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004132 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4133 return -1;
4134 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004135 }
4136 }
4137 *inpos = collendpos;
4138 break;
4139 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004140 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004141 encoding, reason, p, size, exceptionObject,
4142 collstartpos, collendpos, &newpos);
4143 if (repunicode == NULL)
4144 return -1;
4145 /* generate replacement */
4146 repsize = PyUnicode_GET_SIZE(repunicode);
4147 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4148 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004149 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004150 return -1;
4151 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004152 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004153 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004154 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4155 return -1;
4156 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004157 }
4158 *inpos = newpos;
4159 Py_DECREF(repunicode);
4160 }
4161 return 0;
4162}
4163
Guido van Rossumd57fd912000-03-10 22:53:23 +00004164PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004165 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004166 PyObject *mapping,
4167 const char *errors)
4168{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004169 /* output object */
4170 PyObject *res = NULL;
4171 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004172 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004173 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004174 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004175 PyObject *errorHandler = NULL;
4176 PyObject *exc = NULL;
4177 /* the following variable is used for caching string comparisons
4178 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4179 * 3=ignore, 4=xmlcharrefreplace */
4180 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004181
4182 /* Default to Latin-1 */
4183 if (mapping == NULL)
4184 return PyUnicode_EncodeLatin1(p, size, errors);
4185
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004186 /* allocate enough for a simple encoding without
4187 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004188 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004189 if (res == NULL)
4190 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004191 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004192 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004193
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004194 while (inpos<size) {
4195 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004196 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004197 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004198 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004199 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004200 if (charmap_encoding_error(p, size, &inpos, mapping,
4201 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004202 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004203 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004204 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004205 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004206 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004207 else
4208 /* done with this character => adjust input position */
4209 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004210 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004211
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004212 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004213 if (respos<PyBytes_GET_SIZE(res)) {
4214 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004215 goto onError;
4216 }
4217 Py_XDECREF(exc);
4218 Py_XDECREF(errorHandler);
4219 return res;
4220
4221 onError:
4222 Py_XDECREF(res);
4223 Py_XDECREF(exc);
4224 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004225 return NULL;
4226}
4227
4228PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4229 PyObject *mapping)
4230{
4231 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4232 PyErr_BadArgument();
4233 return NULL;
4234 }
4235 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4236 PyUnicode_GET_SIZE(unicode),
4237 mapping,
4238 NULL);
4239}
4240
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004241/* create or adjust a UnicodeTranslateError */
4242static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004243 const Py_UNICODE *unicode, Py_ssize_t size,
4244 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004245 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004246{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004247 if (*exceptionObject == NULL) {
4248 *exceptionObject = PyUnicodeTranslateError_Create(
4249 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004250 }
4251 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004252 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4253 goto onError;
4254 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4255 goto onError;
4256 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4257 goto onError;
4258 return;
4259 onError:
4260 Py_DECREF(*exceptionObject);
4261 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004262 }
4263}
4264
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004265/* raises a UnicodeTranslateError */
4266static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004267 const Py_UNICODE *unicode, Py_ssize_t size,
4268 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004269 const char *reason)
4270{
4271 make_translate_exception(exceptionObject,
4272 unicode, size, startpos, endpos, reason);
4273 if (*exceptionObject != NULL)
4274 PyCodec_StrictErrors(*exceptionObject);
4275}
4276
4277/* error handling callback helper:
4278 build arguments, call the callback and check the arguments,
4279 put the result into newpos and return the replacement string, which
4280 has to be freed by the caller */
4281static PyObject *unicode_translate_call_errorhandler(const char *errors,
4282 PyObject **errorHandler,
4283 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004284 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4285 Py_ssize_t startpos, Py_ssize_t endpos,
4286 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004287{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004288 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004289
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004290 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004291 PyObject *restuple;
4292 PyObject *resunicode;
4293
4294 if (*errorHandler == NULL) {
4295 *errorHandler = PyCodec_LookupError(errors);
4296 if (*errorHandler == NULL)
4297 return NULL;
4298 }
4299
4300 make_translate_exception(exceptionObject,
4301 unicode, size, startpos, endpos, reason);
4302 if (*exceptionObject == NULL)
4303 return NULL;
4304
4305 restuple = PyObject_CallFunctionObjArgs(
4306 *errorHandler, *exceptionObject, NULL);
4307 if (restuple == NULL)
4308 return NULL;
4309 if (!PyTuple_Check(restuple)) {
4310 PyErr_Format(PyExc_TypeError, &argparse[4]);
4311 Py_DECREF(restuple);
4312 return NULL;
4313 }
4314 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004315 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004316 Py_DECREF(restuple);
4317 return NULL;
4318 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004319 if (i_newpos<0)
4320 *newpos = size+i_newpos;
4321 else
4322 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004323 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004324 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004325 Py_DECREF(restuple);
4326 return NULL;
4327 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004328 Py_INCREF(resunicode);
4329 Py_DECREF(restuple);
4330 return resunicode;
4331}
4332
4333/* Lookup the character ch in the mapping and put the result in result,
4334 which must be decrefed by the caller.
4335 Return 0 on success, -1 on error */
4336static
4337int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4338{
4339 PyObject *w = PyInt_FromLong((long)c);
4340 PyObject *x;
4341
4342 if (w == NULL)
4343 return -1;
4344 x = PyObject_GetItem(mapping, w);
4345 Py_DECREF(w);
4346 if (x == NULL) {
4347 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4348 /* No mapping found means: use 1:1 mapping. */
4349 PyErr_Clear();
4350 *result = NULL;
4351 return 0;
4352 } else
4353 return -1;
4354 }
4355 else if (x == Py_None) {
4356 *result = x;
4357 return 0;
4358 }
4359 else if (PyInt_Check(x)) {
4360 long value = PyInt_AS_LONG(x);
4361 long max = PyUnicode_GetMax();
4362 if (value < 0 || value > max) {
4363 PyErr_Format(PyExc_TypeError,
4364 "character mapping must be in range(0x%lx)", max+1);
4365 Py_DECREF(x);
4366 return -1;
4367 }
4368 *result = x;
4369 return 0;
4370 }
4371 else if (PyUnicode_Check(x)) {
4372 *result = x;
4373 return 0;
4374 }
4375 else {
4376 /* wrong return value */
4377 PyErr_SetString(PyExc_TypeError,
4378 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004379 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004380 return -1;
4381 }
4382}
4383/* ensure that *outobj is at least requiredsize characters long,
4384if not reallocate and adjust various state variables.
4385Return 0 on success, -1 on error */
4386static
Walter Dörwald4894c302003-10-24 14:25:28 +00004387int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004388 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004389{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004390 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004391 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004392 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004393 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004394 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004395 if (requiredsize < 2 * oldsize)
4396 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004397 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004398 return -1;
4399 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004400 }
4401 return 0;
4402}
4403/* lookup the character, put the result in the output string and adjust
4404 various state variables. Return a new reference to the object that
4405 was put in the output buffer in *result, or Py_None, if the mapping was
4406 undefined (in which case no character was written).
4407 The called must decref result.
4408 Return 0 on success, -1 on error. */
4409static
Walter Dörwald4894c302003-10-24 14:25:28 +00004410int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004411 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004412 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004413{
Walter Dörwald4894c302003-10-24 14:25:28 +00004414 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004415 return -1;
4416 if (*res==NULL) {
4417 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004418 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004419 }
4420 else if (*res==Py_None)
4421 ;
4422 else if (PyInt_Check(*res)) {
4423 /* no overflow check, because we know that the space is enough */
4424 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4425 }
4426 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004427 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004428 if (repsize==1) {
4429 /* no overflow check, because we know that the space is enough */
4430 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4431 }
4432 else if (repsize!=0) {
4433 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004434 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004435 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004436 repsize - 1;
4437 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004438 return -1;
4439 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4440 *outp += repsize;
4441 }
4442 }
4443 else
4444 return -1;
4445 return 0;
4446}
4447
4448PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004449 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450 PyObject *mapping,
4451 const char *errors)
4452{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004453 /* output object */
4454 PyObject *res = NULL;
4455 /* pointers to the beginning and end+1 of input */
4456 const Py_UNICODE *startp = p;
4457 const Py_UNICODE *endp = p + size;
4458 /* pointer into the output */
4459 Py_UNICODE *str;
4460 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004461 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004462 char *reason = "character maps to <undefined>";
4463 PyObject *errorHandler = NULL;
4464 PyObject *exc = NULL;
4465 /* the following variable is used for caching string comparisons
4466 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4467 * 3=ignore, 4=xmlcharrefreplace */
4468 int known_errorHandler = -1;
4469
Guido van Rossumd57fd912000-03-10 22:53:23 +00004470 if (mapping == NULL) {
4471 PyErr_BadArgument();
4472 return NULL;
4473 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004474
4475 /* allocate enough for a simple 1:1 translation without
4476 replacements, if we need more, we'll resize */
4477 res = PyUnicode_FromUnicode(NULL, size);
4478 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004479 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004481 return res;
4482 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004483
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004484 while (p<endp) {
4485 /* try to encode it */
4486 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004487 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004488 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489 goto onError;
4490 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004491 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004492 if (x!=Py_None) /* it worked => adjust input pointer */
4493 ++p;
4494 else { /* untranslatable character */
4495 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004496 Py_ssize_t repsize;
4497 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004498 Py_UNICODE *uni2;
4499 /* startpos for collecting untranslatable chars */
4500 const Py_UNICODE *collstart = p;
4501 const Py_UNICODE *collend = p+1;
4502 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004503
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004504 /* find all untranslatable characters */
4505 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004506 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004507 goto onError;
4508 Py_XDECREF(x);
4509 if (x!=Py_None)
4510 break;
4511 ++collend;
4512 }
4513 /* cache callback name lookup
4514 * (if not done yet, i.e. it's the first error) */
4515 if (known_errorHandler==-1) {
4516 if ((errors==NULL) || (!strcmp(errors, "strict")))
4517 known_errorHandler = 1;
4518 else if (!strcmp(errors, "replace"))
4519 known_errorHandler = 2;
4520 else if (!strcmp(errors, "ignore"))
4521 known_errorHandler = 3;
4522 else if (!strcmp(errors, "xmlcharrefreplace"))
4523 known_errorHandler = 4;
4524 else
4525 known_errorHandler = 0;
4526 }
4527 switch (known_errorHandler) {
4528 case 1: /* strict */
4529 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4530 goto onError;
4531 case 2: /* replace */
4532 /* No need to check for space, this is a 1:1 replacement */
4533 for (coll = collstart; coll<collend; ++coll)
4534 *str++ = '?';
4535 /* fall through */
4536 case 3: /* ignore */
4537 p = collend;
4538 break;
4539 case 4: /* xmlcharrefreplace */
4540 /* generate replacement (temporarily (mis)uses p) */
4541 for (p = collstart; p < collend; ++p) {
4542 char buffer[2+29+1+1];
4543 char *cp;
4544 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004545 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004546 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4547 goto onError;
4548 for (cp = buffer; *cp; ++cp)
4549 *str++ = *cp;
4550 }
4551 p = collend;
4552 break;
4553 default:
4554 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4555 reason, startp, size, &exc,
4556 collstart-startp, collend-startp, &newpos);
4557 if (repunicode == NULL)
4558 goto onError;
4559 /* generate replacement */
4560 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004561 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004562 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4563 Py_DECREF(repunicode);
4564 goto onError;
4565 }
4566 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4567 *str++ = *uni2;
4568 p = startp + newpos;
4569 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004570 }
4571 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004572 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004573 /* Resize if we allocated to much */
4574 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004575 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004576 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004577 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004578 }
4579 Py_XDECREF(exc);
4580 Py_XDECREF(errorHandler);
4581 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004582
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004583 onError:
4584 Py_XDECREF(res);
4585 Py_XDECREF(exc);
4586 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004587 return NULL;
4588}
4589
4590PyObject *PyUnicode_Translate(PyObject *str,
4591 PyObject *mapping,
4592 const char *errors)
4593{
4594 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004595
Guido van Rossumd57fd912000-03-10 22:53:23 +00004596 str = PyUnicode_FromObject(str);
4597 if (str == NULL)
4598 goto onError;
4599 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4600 PyUnicode_GET_SIZE(str),
4601 mapping,
4602 errors);
4603 Py_DECREF(str);
4604 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004605
Guido van Rossumd57fd912000-03-10 22:53:23 +00004606 onError:
4607 Py_XDECREF(str);
4608 return NULL;
4609}
Tim Petersced69f82003-09-16 20:30:58 +00004610
Guido van Rossum9e896b32000-04-05 20:11:21 +00004611/* --- Decimal Encoder ---------------------------------------------------- */
4612
4613int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004614 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004615 char *output,
4616 const char *errors)
4617{
4618 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004619 PyObject *errorHandler = NULL;
4620 PyObject *exc = NULL;
4621 const char *encoding = "decimal";
4622 const char *reason = "invalid decimal Unicode string";
4623 /* the following variable is used for caching string comparisons
4624 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4625 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004626
4627 if (output == NULL) {
4628 PyErr_BadArgument();
4629 return -1;
4630 }
4631
4632 p = s;
4633 end = s + length;
4634 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004635 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004636 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004637 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004638 Py_ssize_t repsize;
4639 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004640 Py_UNICODE *uni2;
4641 Py_UNICODE *collstart;
4642 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004643
Guido van Rossum9e896b32000-04-05 20:11:21 +00004644 if (Py_UNICODE_ISSPACE(ch)) {
4645 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004646 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004647 continue;
4648 }
4649 decimal = Py_UNICODE_TODECIMAL(ch);
4650 if (decimal >= 0) {
4651 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004652 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004653 continue;
4654 }
Guido van Rossumba477042000-04-06 18:18:10 +00004655 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004656 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004657 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004658 continue;
4659 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004660 /* All other characters are considered unencodable */
4661 collstart = p;
4662 collend = p+1;
4663 while (collend < end) {
4664 if ((0 < *collend && *collend < 256) ||
4665 !Py_UNICODE_ISSPACE(*collend) ||
4666 Py_UNICODE_TODECIMAL(*collend))
4667 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004668 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004669 /* cache callback name lookup
4670 * (if not done yet, i.e. it's the first error) */
4671 if (known_errorHandler==-1) {
4672 if ((errors==NULL) || (!strcmp(errors, "strict")))
4673 known_errorHandler = 1;
4674 else if (!strcmp(errors, "replace"))
4675 known_errorHandler = 2;
4676 else if (!strcmp(errors, "ignore"))
4677 known_errorHandler = 3;
4678 else if (!strcmp(errors, "xmlcharrefreplace"))
4679 known_errorHandler = 4;
4680 else
4681 known_errorHandler = 0;
4682 }
4683 switch (known_errorHandler) {
4684 case 1: /* strict */
4685 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4686 goto onError;
4687 case 2: /* replace */
4688 for (p = collstart; p < collend; ++p)
4689 *output++ = '?';
4690 /* fall through */
4691 case 3: /* ignore */
4692 p = collend;
4693 break;
4694 case 4: /* xmlcharrefreplace */
4695 /* generate replacement (temporarily (mis)uses p) */
4696 for (p = collstart; p < collend; ++p)
4697 output += sprintf(output, "&#%d;", (int)*p);
4698 p = collend;
4699 break;
4700 default:
4701 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4702 encoding, reason, s, length, &exc,
4703 collstart-s, collend-s, &newpos);
4704 if (repunicode == NULL)
4705 goto onError;
4706 /* generate replacement */
4707 repsize = PyUnicode_GET_SIZE(repunicode);
4708 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4709 Py_UNICODE ch = *uni2;
4710 if (Py_UNICODE_ISSPACE(ch))
4711 *output++ = ' ';
4712 else {
4713 decimal = Py_UNICODE_TODECIMAL(ch);
4714 if (decimal >= 0)
4715 *output++ = '0' + decimal;
4716 else if (0 < ch && ch < 256)
4717 *output++ = (char)ch;
4718 else {
4719 Py_DECREF(repunicode);
4720 raise_encode_exception(&exc, encoding,
4721 s, length, collstart-s, collend-s, reason);
4722 goto onError;
4723 }
4724 }
4725 }
4726 p = s + newpos;
4727 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004728 }
4729 }
4730 /* 0-terminate the output string */
4731 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004732 Py_XDECREF(exc);
4733 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004734 return 0;
4735
4736 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004737 Py_XDECREF(exc);
4738 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004739 return -1;
4740}
4741
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742/* --- Helpers ------------------------------------------------------------ */
4743
Thomas Wouters477c8d52006-05-27 19:21:47 +00004744#define STRINGLIB_CHAR Py_UNICODE
4745
4746#define STRINGLIB_LEN PyUnicode_GET_SIZE
4747#define STRINGLIB_NEW PyUnicode_FromUnicode
4748#define STRINGLIB_STR PyUnicode_AS_UNICODE
4749
4750Py_LOCAL_INLINE(int)
4751STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004752{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004753 if (str[0] != other[0])
4754 return 1;
4755 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004756}
4757
Thomas Wouters477c8d52006-05-27 19:21:47 +00004758#define STRINGLIB_EMPTY unicode_empty
4759
4760#include "stringlib/fastsearch.h"
4761
4762#include "stringlib/count.h"
4763#include "stringlib/find.h"
4764#include "stringlib/partition.h"
4765
/* Helper macro to fix up start/end slice values against (obj)->length.

   Expects Py_ssize_t-like variables named start and end in the calling
   scope and modifies them in place:
     - a negative start is taken relative to the end, then clamped to 0;
     - end is clamped to the length; a negative end is taken relative
       to the end, then clamped to 0.

   Wrapped in do { } while (0) so that the expansion is a single
   statement and is safe as the body of an unbraced if/else.  Invoke it
   with a trailing semicolon: FIX_START_END(obj); */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
4778
Martin v. Löwis18e16552006-02-15 17:27:45 +00004779Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004780 PyObject *substr,
4781 Py_ssize_t start,
4782 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004783{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004784 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004785 PyUnicodeObject* str_obj;
4786 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004787
Thomas Wouters477c8d52006-05-27 19:21:47 +00004788 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4789 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004790 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004791 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4792 if (!sub_obj) {
4793 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794 return -1;
4795 }
Tim Petersced69f82003-09-16 20:30:58 +00004796
Thomas Wouters477c8d52006-05-27 19:21:47 +00004797 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004798
Thomas Wouters477c8d52006-05-27 19:21:47 +00004799 result = stringlib_count(
4800 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4801 );
4802
4803 Py_DECREF(sub_obj);
4804 Py_DECREF(str_obj);
4805
Guido van Rossumd57fd912000-03-10 22:53:23 +00004806 return result;
4807}
4808
Martin v. Löwis18e16552006-02-15 17:27:45 +00004809Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004810 PyObject *sub,
4811 Py_ssize_t start,
4812 Py_ssize_t end,
4813 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004814{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004815 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004816
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004818 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004819 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004820 sub = PyUnicode_FromObject(sub);
4821 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004822 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004823 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004824 }
Tim Petersced69f82003-09-16 20:30:58 +00004825
Thomas Wouters477c8d52006-05-27 19:21:47 +00004826 if (direction > 0)
4827 result = stringlib_find_slice(
4828 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4829 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4830 start, end
4831 );
4832 else
4833 result = stringlib_rfind_slice(
4834 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4835 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4836 start, end
4837 );
4838
Guido van Rossumd57fd912000-03-10 22:53:23 +00004839 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004840 Py_DECREF(sub);
4841
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842 return result;
4843}
4844
Tim Petersced69f82003-09-16 20:30:58 +00004845static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846int tailmatch(PyUnicodeObject *self,
4847 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004848 Py_ssize_t start,
4849 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004850 int direction)
4851{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852 if (substring->length == 0)
4853 return 1;
4854
Thomas Wouters477c8d52006-05-27 19:21:47 +00004855 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856
4857 end -= substring->length;
4858 if (end < start)
4859 return 0;
4860
4861 if (direction > 0) {
4862 if (Py_UNICODE_MATCH(self, end, substring))
4863 return 1;
4864 } else {
4865 if (Py_UNICODE_MATCH(self, start, substring))
4866 return 1;
4867 }
4868
4869 return 0;
4870}
4871
Martin v. Löwis18e16552006-02-15 17:27:45 +00004872Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004873 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004874 Py_ssize_t start,
4875 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876 int direction)
4877{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004878 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004879
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880 str = PyUnicode_FromObject(str);
4881 if (str == NULL)
4882 return -1;
4883 substr = PyUnicode_FromObject(substr);
4884 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004885 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004886 return -1;
4887 }
Tim Petersced69f82003-09-16 20:30:58 +00004888
Guido van Rossumd57fd912000-03-10 22:53:23 +00004889 result = tailmatch((PyUnicodeObject *)str,
4890 (PyUnicodeObject *)substr,
4891 start, end, direction);
4892 Py_DECREF(str);
4893 Py_DECREF(substr);
4894 return result;
4895}
4896
Guido van Rossumd57fd912000-03-10 22:53:23 +00004897/* Apply fixfct filter to the Unicode object self and return a
4898 reference to the modified object */
4899
Tim Petersced69f82003-09-16 20:30:58 +00004900static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901PyObject *fixup(PyUnicodeObject *self,
4902 int (*fixfct)(PyUnicodeObject *s))
4903{
4904
4905 PyUnicodeObject *u;
4906
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004907 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908 if (u == NULL)
4909 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004910
4911 Py_UNICODE_COPY(u->str, self->str, self->length);
4912
Tim Peters7a29bd52001-09-12 03:03:31 +00004913 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004914 /* fixfct should return TRUE if it modified the buffer. If
4915 FALSE, return a reference to the original buffer instead
4916 (to save space, not time) */
4917 Py_INCREF(self);
4918 Py_DECREF(u);
4919 return (PyObject*) self;
4920 }
4921 return (PyObject*) u;
4922}
4923
Tim Petersced69f82003-09-16 20:30:58 +00004924static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004925int fixupper(PyUnicodeObject *self)
4926{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004927 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004928 Py_UNICODE *s = self->str;
4929 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004930
Guido van Rossumd57fd912000-03-10 22:53:23 +00004931 while (len-- > 0) {
4932 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004933
Guido van Rossumd57fd912000-03-10 22:53:23 +00004934 ch = Py_UNICODE_TOUPPER(*s);
4935 if (ch != *s) {
4936 status = 1;
4937 *s = ch;
4938 }
4939 s++;
4940 }
4941
4942 return status;
4943}
4944
Tim Petersced69f82003-09-16 20:30:58 +00004945static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004946int fixlower(PyUnicodeObject *self)
4947{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004948 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004949 Py_UNICODE *s = self->str;
4950 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004951
Guido van Rossumd57fd912000-03-10 22:53:23 +00004952 while (len-- > 0) {
4953 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004954
Guido van Rossumd57fd912000-03-10 22:53:23 +00004955 ch = Py_UNICODE_TOLOWER(*s);
4956 if (ch != *s) {
4957 status = 1;
4958 *s = ch;
4959 }
4960 s++;
4961 }
4962
4963 return status;
4964}
4965
Tim Petersced69f82003-09-16 20:30:58 +00004966static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004967int fixswapcase(PyUnicodeObject *self)
4968{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004969 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004970 Py_UNICODE *s = self->str;
4971 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004972
Guido van Rossumd57fd912000-03-10 22:53:23 +00004973 while (len-- > 0) {
4974 if (Py_UNICODE_ISUPPER(*s)) {
4975 *s = Py_UNICODE_TOLOWER(*s);
4976 status = 1;
4977 } else if (Py_UNICODE_ISLOWER(*s)) {
4978 *s = Py_UNICODE_TOUPPER(*s);
4979 status = 1;
4980 }
4981 s++;
4982 }
4983
4984 return status;
4985}
4986
Tim Petersced69f82003-09-16 20:30:58 +00004987static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004988int fixcapitalize(PyUnicodeObject *self)
4989{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004990 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004991 Py_UNICODE *s = self->str;
4992 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004993
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004994 if (len == 0)
4995 return 0;
4996 if (Py_UNICODE_ISLOWER(*s)) {
4997 *s = Py_UNICODE_TOUPPER(*s);
4998 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004999 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005000 s++;
5001 while (--len > 0) {
5002 if (Py_UNICODE_ISUPPER(*s)) {
5003 *s = Py_UNICODE_TOLOWER(*s);
5004 status = 1;
5005 }
5006 s++;
5007 }
5008 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005009}
5010
5011static
5012int fixtitle(PyUnicodeObject *self)
5013{
5014 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5015 register Py_UNICODE *e;
5016 int previous_is_cased;
5017
5018 /* Shortcut for single character strings */
5019 if (PyUnicode_GET_SIZE(self) == 1) {
5020 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5021 if (*p != ch) {
5022 *p = ch;
5023 return 1;
5024 }
5025 else
5026 return 0;
5027 }
Tim Petersced69f82003-09-16 20:30:58 +00005028
Guido van Rossumd57fd912000-03-10 22:53:23 +00005029 e = p + PyUnicode_GET_SIZE(self);
5030 previous_is_cased = 0;
5031 for (; p < e; p++) {
5032 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005033
Guido van Rossumd57fd912000-03-10 22:53:23 +00005034 if (previous_is_cased)
5035 *p = Py_UNICODE_TOLOWER(ch);
5036 else
5037 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005038
5039 if (Py_UNICODE_ISLOWER(ch) ||
5040 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005041 Py_UNICODE_ISTITLE(ch))
5042 previous_is_cased = 1;
5043 else
5044 previous_is_cased = 0;
5045 }
5046 return 1;
5047}
5048
Tim Peters8ce9f162004-08-27 01:49:32 +00005049PyObject *
5050PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005051{
Tim Peters8ce9f162004-08-27 01:49:32 +00005052 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005053 const Py_UNICODE blank = ' ';
5054 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005055 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005056 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005057 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5058 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005059 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5060 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005061 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005062 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005063 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005064
Tim Peters05eba1f2004-08-27 21:32:02 +00005065 fseq = PySequence_Fast(seq, "");
5066 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005067 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005068 }
5069
Tim Peters91879ab2004-08-27 22:35:44 +00005070 /* Grrrr. A codec may be invoked to convert str objects to
5071 * Unicode, and so it's possible to call back into Python code
5072 * during PyUnicode_FromObject(), and so it's possible for a sick
5073 * codec to change the size of fseq (if seq is a list). Therefore
5074 * we have to keep refetching the size -- can't assume seqlen
5075 * is invariant.
5076 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005077 seqlen = PySequence_Fast_GET_SIZE(fseq);
5078 /* If empty sequence, return u"". */
5079 if (seqlen == 0) {
5080 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5081 goto Done;
5082 }
5083 /* If singleton sequence with an exact Unicode, return that. */
5084 if (seqlen == 1) {
5085 item = PySequence_Fast_GET_ITEM(fseq, 0);
5086 if (PyUnicode_CheckExact(item)) {
5087 Py_INCREF(item);
5088 res = (PyUnicodeObject *)item;
5089 goto Done;
5090 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005091 }
5092
Tim Peters05eba1f2004-08-27 21:32:02 +00005093 /* At least two items to join, or one that isn't exact Unicode. */
5094 if (seqlen > 1) {
5095 /* Set up sep and seplen -- they're needed. */
5096 if (separator == NULL) {
5097 sep = &blank;
5098 seplen = 1;
5099 }
5100 else {
5101 internal_separator = PyUnicode_FromObject(separator);
5102 if (internal_separator == NULL)
5103 goto onError;
5104 sep = PyUnicode_AS_UNICODE(internal_separator);
5105 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005106 /* In case PyUnicode_FromObject() mutated seq. */
5107 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005108 }
5109 }
5110
5111 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005112 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005113 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005114 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005115 res_p = PyUnicode_AS_UNICODE(res);
5116 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005117
Tim Peters05eba1f2004-08-27 21:32:02 +00005118 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005119 Py_ssize_t itemlen;
5120 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005121
5122 item = PySequence_Fast_GET_ITEM(fseq, i);
5123 /* Convert item to Unicode. */
5124 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5125 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005126 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005127 " %.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00005128 i, Py_Type(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005129 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005130 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005131 item = PyUnicode_FromObject(item);
5132 if (item == NULL)
5133 goto onError;
5134 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005135
Tim Peters91879ab2004-08-27 22:35:44 +00005136 /* In case PyUnicode_FromObject() mutated seq. */
5137 seqlen = PySequence_Fast_GET_SIZE(fseq);
5138
Tim Peters8ce9f162004-08-27 01:49:32 +00005139 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005140 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005141 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005142 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005143 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005144 if (i < seqlen - 1) {
5145 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005146 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005147 goto Overflow;
5148 }
5149 if (new_res_used > res_alloc) {
5150 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005151 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005152 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005153 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005154 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005155 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005156 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005157 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005158 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005159 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005160 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005162
5163 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005164 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005165 res_p += itemlen;
5166 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005167 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005168 res_p += seplen;
5169 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005171 res_used = new_res_used;
5172 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005173
Tim Peters05eba1f2004-08-27 21:32:02 +00005174 /* Shrink res to match the used area; this probably can't fail,
5175 * but it's cheap to check.
5176 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005177 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005178 goto onError;
5179
5180 Done:
5181 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005182 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183 return (PyObject *)res;
5184
Tim Peters8ce9f162004-08-27 01:49:32 +00005185 Overflow:
5186 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005187 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005188 Py_DECREF(item);
5189 /* fall through */
5190
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005192 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005193 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005194 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195 return NULL;
5196}
5197
Tim Petersced69f82003-09-16 20:30:58 +00005198static
5199PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005200 Py_ssize_t left,
5201 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202 Py_UNICODE fill)
5203{
5204 PyUnicodeObject *u;
5205
5206 if (left < 0)
5207 left = 0;
5208 if (right < 0)
5209 right = 0;
5210
Tim Peters7a29bd52001-09-12 03:03:31 +00005211 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005212 Py_INCREF(self);
5213 return self;
5214 }
5215
5216 u = _PyUnicode_New(left + self->length + right);
5217 if (u) {
5218 if (left)
5219 Py_UNICODE_FILL(u->str, fill, left);
5220 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5221 if (right)
5222 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5223 }
5224
5225 return u;
5226}
5227
/* Append the slice data[left:right] to list as a new unicode object.

   Relies on the calling function providing:
     - a PyObject *str scratch variable,
     - a PyObject *list to append to,
     - an onError label, jumped to when the slice cannot be created or
       appended (the temporary reference is released first).

   Wrapped in do { } while (0) so that the expansion is a single
   statement; invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        /* the list holds its own reference now */                      \
        Py_DECREF(str);                                                 \
    } while (0)
5238
5239static
5240PyObject *split_whitespace(PyUnicodeObject *self,
5241 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005242 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005244 register Py_ssize_t i;
5245 register Py_ssize_t j;
5246 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005247 PyObject *str;
5248
5249 for (i = j = 0; i < len; ) {
5250 /* find a token */
5251 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5252 i++;
5253 j = i;
5254 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5255 i++;
5256 if (j < i) {
5257 if (maxcount-- <= 0)
5258 break;
5259 SPLIT_APPEND(self->str, j, i);
5260 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5261 i++;
5262 j = i;
5263 }
5264 }
5265 if (j < len) {
5266 SPLIT_APPEND(self->str, j, len);
5267 }
5268 return list;
5269
5270 onError:
5271 Py_DECREF(list);
5272 return NULL;
5273}
5274
5275PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005276 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005278 register Py_ssize_t i;
5279 register Py_ssize_t j;
5280 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005281 PyObject *list;
5282 PyObject *str;
5283 Py_UNICODE *data;
5284
5285 string = PyUnicode_FromObject(string);
5286 if (string == NULL)
5287 return NULL;
5288 data = PyUnicode_AS_UNICODE(string);
5289 len = PyUnicode_GET_SIZE(string);
5290
Guido van Rossumd57fd912000-03-10 22:53:23 +00005291 list = PyList_New(0);
5292 if (!list)
5293 goto onError;
5294
5295 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005296 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005297
Guido van Rossumd57fd912000-03-10 22:53:23 +00005298 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005299 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005300 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301
5302 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005303 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304 if (i < len) {
5305 if (data[i] == '\r' && i + 1 < len &&
5306 data[i+1] == '\n')
5307 i += 2;
5308 else
5309 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005310 if (keepends)
5311 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005312 }
Guido van Rossum86662912000-04-11 15:38:46 +00005313 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005314 j = i;
5315 }
5316 if (j < len) {
5317 SPLIT_APPEND(data, j, len);
5318 }
5319
5320 Py_DECREF(string);
5321 return list;
5322
5323 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005324 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005325 Py_DECREF(string);
5326 return NULL;
5327}
5328
Tim Petersced69f82003-09-16 20:30:58 +00005329static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330PyObject *split_char(PyUnicodeObject *self,
5331 PyObject *list,
5332 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005333 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005334{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005335 register Py_ssize_t i;
5336 register Py_ssize_t j;
5337 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338 PyObject *str;
5339
5340 for (i = j = 0; i < len; ) {
5341 if (self->str[i] == ch) {
5342 if (maxcount-- <= 0)
5343 break;
5344 SPLIT_APPEND(self->str, j, i);
5345 i = j = i + 1;
5346 } else
5347 i++;
5348 }
5349 if (j <= len) {
5350 SPLIT_APPEND(self->str, j, len);
5351 }
5352 return list;
5353
5354 onError:
5355 Py_DECREF(list);
5356 return NULL;
5357}
5358
Tim Petersced69f82003-09-16 20:30:58 +00005359static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360PyObject *split_substring(PyUnicodeObject *self,
5361 PyObject *list,
5362 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005363 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005364{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005365 register Py_ssize_t i;
5366 register Py_ssize_t j;
5367 Py_ssize_t len = self->length;
5368 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369 PyObject *str;
5370
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005371 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372 if (Py_UNICODE_MATCH(self, i, substring)) {
5373 if (maxcount-- <= 0)
5374 break;
5375 SPLIT_APPEND(self->str, j, i);
5376 i = j = i + sublen;
5377 } else
5378 i++;
5379 }
5380 if (j <= len) {
5381 SPLIT_APPEND(self->str, j, len);
5382 }
5383 return list;
5384
5385 onError:
5386 Py_DECREF(list);
5387 return NULL;
5388}
5389
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005390static
5391PyObject *rsplit_whitespace(PyUnicodeObject *self,
5392 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005393 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005394{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005395 register Py_ssize_t i;
5396 register Py_ssize_t j;
5397 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005398 PyObject *str;
5399
5400 for (i = j = len - 1; i >= 0; ) {
5401 /* find a token */
5402 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5403 i--;
5404 j = i;
5405 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5406 i--;
5407 if (j > i) {
5408 if (maxcount-- <= 0)
5409 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005410 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005411 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5412 i--;
5413 j = i;
5414 }
5415 }
5416 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005417 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005418 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005419 if (PyList_Reverse(list) < 0)
5420 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005421 return list;
5422
5423 onError:
5424 Py_DECREF(list);
5425 return NULL;
5426}
5427
5428static
5429PyObject *rsplit_char(PyUnicodeObject *self,
5430 PyObject *list,
5431 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005432 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005433{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005434 register Py_ssize_t i;
5435 register Py_ssize_t j;
5436 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005437 PyObject *str;
5438
5439 for (i = j = len - 1; i >= 0; ) {
5440 if (self->str[i] == ch) {
5441 if (maxcount-- <= 0)
5442 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005443 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005444 j = i = i - 1;
5445 } else
5446 i--;
5447 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005448 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005449 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005450 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005451 if (PyList_Reverse(list) < 0)
5452 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005453 return list;
5454
5455 onError:
5456 Py_DECREF(list);
5457 return NULL;
5458}
5459
5460static
5461PyObject *rsplit_substring(PyUnicodeObject *self,
5462 PyObject *list,
5463 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005464 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005465{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005466 register Py_ssize_t i;
5467 register Py_ssize_t j;
5468 Py_ssize_t len = self->length;
5469 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005470 PyObject *str;
5471
5472 for (i = len - sublen, j = len; i >= 0; ) {
5473 if (Py_UNICODE_MATCH(self, i, substring)) {
5474 if (maxcount-- <= 0)
5475 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005476 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005477 j = i;
5478 i -= sublen;
5479 } else
5480 i--;
5481 }
5482 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005483 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005484 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005485 if (PyList_Reverse(list) < 0)
5486 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005487 return list;
5488
5489 onError:
5490 Py_DECREF(list);
5491 return NULL;
5492}
5493
Guido van Rossumd57fd912000-03-10 22:53:23 +00005494#undef SPLIT_APPEND
5495
5496static
5497PyObject *split(PyUnicodeObject *self,
5498 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005499 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500{
5501 PyObject *list;
5502
5503 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005504 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005505
5506 list = PyList_New(0);
5507 if (!list)
5508 return NULL;
5509
5510 if (substring == NULL)
5511 return split_whitespace(self,list,maxcount);
5512
5513 else if (substring->length == 1)
5514 return split_char(self,list,substring->str[0],maxcount);
5515
5516 else if (substring->length == 0) {
5517 Py_DECREF(list);
5518 PyErr_SetString(PyExc_ValueError, "empty separator");
5519 return NULL;
5520 }
5521 else
5522 return split_substring(self,list,substring,maxcount);
5523}
5524
Tim Petersced69f82003-09-16 20:30:58 +00005525static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005526PyObject *rsplit(PyUnicodeObject *self,
5527 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005528 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005529{
5530 PyObject *list;
5531
5532 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005533 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005534
5535 list = PyList_New(0);
5536 if (!list)
5537 return NULL;
5538
5539 if (substring == NULL)
5540 return rsplit_whitespace(self,list,maxcount);
5541
5542 else if (substring->length == 1)
5543 return rsplit_char(self,list,substring->str[0],maxcount);
5544
5545 else if (substring->length == 0) {
5546 Py_DECREF(list);
5547 PyErr_SetString(PyExc_ValueError, "empty separator");
5548 return NULL;
5549 }
5550 else
5551 return rsplit_substring(self,list,substring,maxcount);
5552}
5553
5554static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555PyObject *replace(PyUnicodeObject *self,
5556 PyUnicodeObject *str1,
5557 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005558 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559{
5560 PyUnicodeObject *u;
5561
5562 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005563 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564
Thomas Wouters477c8d52006-05-27 19:21:47 +00005565 if (str1->length == str2->length) {
5566 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005567 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005568 if (str1->length == 1) {
5569 /* replace characters */
5570 Py_UNICODE u1, u2;
5571 if (!findchar(self->str, self->length, str1->str[0]))
5572 goto nothing;
5573 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5574 if (!u)
5575 return NULL;
5576 Py_UNICODE_COPY(u->str, self->str, self->length);
5577 u1 = str1->str[0];
5578 u2 = str2->str[0];
5579 for (i = 0; i < u->length; i++)
5580 if (u->str[i] == u1) {
5581 if (--maxcount < 0)
5582 break;
5583 u->str[i] = u2;
5584 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005586 i = fastsearch(
5587 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005589 if (i < 0)
5590 goto nothing;
5591 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5592 if (!u)
5593 return NULL;
5594 Py_UNICODE_COPY(u->str, self->str, self->length);
5595 while (i <= self->length - str1->length)
5596 if (Py_UNICODE_MATCH(self, i, str1)) {
5597 if (--maxcount < 0)
5598 break;
5599 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5600 i += str1->length;
5601 } else
5602 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005605
5606 Py_ssize_t n, i, j, e;
5607 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005608 Py_UNICODE *p;
5609
5610 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005611 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612 if (n > maxcount)
5613 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005614 if (n == 0)
5615 goto nothing;
5616 /* new_size = self->length + n * (str2->length - str1->length)); */
5617 delta = (str2->length - str1->length);
5618 if (delta == 0) {
5619 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005621 product = n * (str2->length - str1->length);
5622 if ((product / (str2->length - str1->length)) != n) {
5623 PyErr_SetString(PyExc_OverflowError,
5624 "replace string is too long");
5625 return NULL;
5626 }
5627 new_size = self->length + product;
5628 if (new_size < 0) {
5629 PyErr_SetString(PyExc_OverflowError,
5630 "replace string is too long");
5631 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632 }
5633 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005634 u = _PyUnicode_New(new_size);
5635 if (!u)
5636 return NULL;
5637 i = 0;
5638 p = u->str;
5639 e = self->length - str1->length;
5640 if (str1->length > 0) {
5641 while (n-- > 0) {
5642 /* look for next match */
5643 j = i;
5644 while (j <= e) {
5645 if (Py_UNICODE_MATCH(self, j, str1))
5646 break;
5647 j++;
5648 }
5649 if (j > i) {
5650 if (j > e)
5651 break;
5652 /* copy unchanged part [i:j] */
5653 Py_UNICODE_COPY(p, self->str+i, j-i);
5654 p += j - i;
5655 }
5656 /* copy substitution string */
5657 if (str2->length > 0) {
5658 Py_UNICODE_COPY(p, str2->str, str2->length);
5659 p += str2->length;
5660 }
5661 i = j + str1->length;
5662 }
5663 if (i < self->length)
5664 /* copy tail [i:] */
5665 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5666 } else {
5667 /* interleave */
5668 while (n > 0) {
5669 Py_UNICODE_COPY(p, str2->str, str2->length);
5670 p += str2->length;
5671 if (--n <= 0)
5672 break;
5673 *p++ = self->str[i++];
5674 }
5675 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5676 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005679
5680nothing:
5681 /* nothing to replace; return original string (when possible) */
5682 if (PyUnicode_CheckExact(self)) {
5683 Py_INCREF(self);
5684 return (PyObject *) self;
5685 }
5686 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687}
5688
5689/* --- Unicode Object Methods --------------------------------------------- */
5690
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005691PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692"S.title() -> unicode\n\
5693\n\
5694Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005695characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696
5697static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005698unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700 return fixup(self, fixtitle);
5701}
5702
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005703PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704"S.capitalize() -> unicode\n\
5705\n\
5706Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005707have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708
5709static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005710unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712 return fixup(self, fixcapitalize);
5713}
5714
#if 0
/* Disabled: kept for reference only, never compiled. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split S on whitespace, capitalize every word in place in the list,
   then join the words again.  Returns a new reference or NULL. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capped = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                                 fixcapitalize);
        if (capped == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* swap the capitalized word into the slot, releasing the old one */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
5752
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005753/* Argument converter. Coerces to a single unicode character */
5754
5755static int
5756convert_uc(PyObject *obj, void *addr)
5757{
5758 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5759 PyObject *uniobj;
5760 Py_UNICODE *unistr;
5761
5762 uniobj = PyUnicode_FromObject(obj);
5763 if (uniobj == NULL) {
5764 PyErr_SetString(PyExc_TypeError,
5765 "The fill character cannot be converted to Unicode");
5766 return 0;
5767 }
5768 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5769 PyErr_SetString(PyExc_TypeError,
5770 "The fill character must be exactly one character long");
5771 Py_DECREF(uniobj);
5772 return 0;
5773 }
5774 unistr = PyUnicode_AS_UNICODE(uniobj);
5775 *fillcharloc = unistr[0];
5776 Py_DECREF(uniobj);
5777 return 1;
5778}
5779
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005780PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005781"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005783Return S centered in a Unicode string of length width. Padding is\n\
5784done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785
5786static PyObject *
5787unicode_center(PyUnicodeObject *self, PyObject *args)
5788{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005789 Py_ssize_t marg, left;
5790 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005791 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792
Thomas Woutersde017742006-02-16 19:34:37 +00005793 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005794 return NULL;
5795
Tim Peters7a29bd52001-09-12 03:03:31 +00005796 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797 Py_INCREF(self);
5798 return (PyObject*) self;
5799 }
5800
5801 marg = width - self->length;
5802 left = marg / 2 + (marg & width & 1);
5803
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005804 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005805}
5806
Marc-André Lemburge5034372000-08-08 08:04:29 +00005807#if 0
5808
5809/* This code should go into some future Unicode collation support
5810 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005811 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005812
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005813/* speedy UTF-16 code point order comparison */
5814/* gleaned from: */
5815/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5816
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005817static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005818{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005819 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005820 0, 0, 0, 0, 0, 0, 0, 0,
5821 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005822 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005823};
5824
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825static int
5826unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5827{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005828 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005829
Guido van Rossumd57fd912000-03-10 22:53:23 +00005830 Py_UNICODE *s1 = str1->str;
5831 Py_UNICODE *s2 = str2->str;
5832
5833 len1 = str1->length;
5834 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005835
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005837 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005838
5839 c1 = *s1++;
5840 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005841
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005842 if (c1 > (1<<11) * 26)
5843 c1 += utf16Fixup[c1>>11];
5844 if (c2 > (1<<11) * 26)
5845 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005846 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005847
5848 if (c1 != c2)
5849 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005850
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005851 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005852 }
5853
5854 return (len1 < len2) ? -1 : (len1 != len2);
5855}
5856
Marc-André Lemburge5034372000-08-08 08:04:29 +00005857#else
5858
5859static int
5860unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5861{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005862 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005863
5864 Py_UNICODE *s1 = str1->str;
5865 Py_UNICODE *s2 = str2->str;
5866
5867 len1 = str1->length;
5868 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005869
Marc-André Lemburge5034372000-08-08 08:04:29 +00005870 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005871 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005872
Fredrik Lundh45714e92001-06-26 16:39:36 +00005873 c1 = *s1++;
5874 c2 = *s2++;
5875
5876 if (c1 != c2)
5877 return (c1 < c2) ? -1 : 1;
5878
Marc-André Lemburge5034372000-08-08 08:04:29 +00005879 len1--; len2--;
5880 }
5881
5882 return (len1 < len2) ? -1 : (len1 != len2);
5883}
5884
5885#endif
5886
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887int PyUnicode_Compare(PyObject *left,
5888 PyObject *right)
5889{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005890 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5891 return unicode_compare((PyUnicodeObject *)left,
5892 (PyUnicodeObject *)right);
5893 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5894 (PyUnicode_Check(left) && PyString_Check(right))) {
5895 if (PyUnicode_Check(left))
5896 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5897 if (PyUnicode_Check(right))
5898 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5899 assert(PyString_Check(left));
5900 assert(PyString_Check(right));
5901 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005903 PyErr_Format(PyExc_TypeError,
5904 "Can't compare %.100s and %.100s",
5905 left->ob_type->tp_name,
5906 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907 return -1;
5908}
5909
Martin v. Löwis5b222132007-06-10 09:51:05 +00005910int
5911PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
5912{
5913 int i;
5914 Py_UNICODE *id;
5915 assert(PyUnicode_Check(uni));
5916 id = PyUnicode_AS_UNICODE(uni);
5917 /* Compare Unicode string and source character set string */
5918 for (i = 0; id[i] && str[i]; i++)
5919 if (id[i] != str[i])
5920 return ((int)id[i] < (int)str[i]) ? -1 : 1;
5921 if (id[i])
5922 return 1; /* uni is longer */
5923 if (str[i])
5924 return -1; /* str is longer */
5925 return 0;
5926}
5927
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005928PyObject *PyUnicode_RichCompare(PyObject *left,
5929 PyObject *right,
5930 int op)
5931{
5932 int result;
5933
5934 result = PyUnicode_Compare(left, right);
5935 if (result == -1 && PyErr_Occurred())
5936 goto onError;
5937
5938 /* Convert the return value to a Boolean */
5939 switch (op) {
5940 case Py_EQ:
5941 result = (result == 0);
5942 break;
5943 case Py_NE:
5944 result = (result != 0);
5945 break;
5946 case Py_LE:
5947 result = (result <= 0);
5948 break;
5949 case Py_GE:
5950 result = (result >= 0);
5951 break;
5952 case Py_LT:
5953 result = (result == -1);
5954 break;
5955 case Py_GT:
5956 result = (result == 1);
5957 break;
5958 }
5959 return PyBool_FromLong(result);
5960
5961 onError:
5962
5963 /* Standard case
5964
5965 Type errors mean that PyUnicode_FromObject() could not convert
5966 one of the arguments (usually the right hand side) to Unicode,
5967 ie. we can't handle the comparison request. However, it is
5968 possible that the other object knows a comparison method, which
5969 is why we return Py_NotImplemented to give the other object a
5970 chance.
5971
5972 */
5973 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5974 PyErr_Clear();
5975 Py_INCREF(Py_NotImplemented);
5976 return Py_NotImplemented;
5977 }
5978 if (op != Py_EQ && op != Py_NE)
5979 return NULL;
5980
5981 /* Equality comparison.
5982
5983 This is a special case: we silence any PyExc_UnicodeDecodeError
5984 and instead turn it into a PyErr_UnicodeWarning.
5985
5986 */
5987 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5988 return NULL;
5989 PyErr_Clear();
5990 if (PyErr_Warn(PyExc_UnicodeWarning,
5991 (op == Py_EQ) ?
5992 "Unicode equal comparison "
5993 "failed to convert both arguments to Unicode - "
5994 "interpreting them as being unequal" :
5995 "Unicode unequal comparison "
5996 "failed to convert both arguments to Unicode - "
5997 "interpreting them as being unequal"
5998 ) < 0)
5999 return NULL;
6000 result = (op == Py_NE);
6001 return PyBool_FromLong(result);
6002}
6003
Guido van Rossum403d68b2000-03-13 15:55:09 +00006004int PyUnicode_Contains(PyObject *container,
6005 PyObject *element)
6006{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006007 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006008 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006009
6010 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006011 sub = PyUnicode_FromObject(element);
6012 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006013 PyErr_Format(PyExc_TypeError,
6014 "'in <string>' requires string as left operand, not %s",
6015 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006016 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006017 }
6018
Thomas Wouters477c8d52006-05-27 19:21:47 +00006019 str = PyUnicode_FromObject(container);
6020 if (!str) {
6021 Py_DECREF(sub);
6022 return -1;
6023 }
6024
6025 result = stringlib_contains_obj(str, sub);
6026
6027 Py_DECREF(str);
6028 Py_DECREF(sub);
6029
Guido van Rossum403d68b2000-03-13 15:55:09 +00006030 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006031}
6032
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033/* Concat to string or Unicode object giving a new Unicode object. */
6034
6035PyObject *PyUnicode_Concat(PyObject *left,
6036 PyObject *right)
6037{
6038 PyUnicodeObject *u = NULL, *v = NULL, *w;
6039
Guido van Rossum84d79dd2007-04-13 02:23:57 +00006040 if (PyBytes_Check(left) || PyBytes_Check(right))
6041 return PyBytes_Concat(left, right);
6042
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043 /* Coerce the two arguments */
6044 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6045 if (u == NULL)
6046 goto onError;
6047 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6048 if (v == NULL)
6049 goto onError;
6050
6051 /* Shortcuts */
6052 if (v == unicode_empty) {
6053 Py_DECREF(v);
6054 return (PyObject *)u;
6055 }
6056 if (u == unicode_empty) {
6057 Py_DECREF(u);
6058 return (PyObject *)v;
6059 }
6060
6061 /* Concat the two Unicode strings */
6062 w = _PyUnicode_New(u->length + v->length);
6063 if (w == NULL)
6064 goto onError;
6065 Py_UNICODE_COPY(w->str, u->str, u->length);
6066 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6067
6068 Py_DECREF(u);
6069 Py_DECREF(v);
6070 return (PyObject *)w;
6071
6072onError:
6073 Py_XDECREF(u);
6074 Py_XDECREF(v);
6075 return NULL;
6076}
6077
Walter Dörwald1ab83302007-05-18 17:15:44 +00006078void
6079PyUnicode_Append(PyObject **pleft, PyObject *right)
6080{
6081 PyObject *new;
6082 if (*pleft == NULL)
6083 return;
6084 if (right == NULL || !PyUnicode_Check(*pleft)) {
6085 Py_DECREF(*pleft);
6086 *pleft = NULL;
6087 return;
6088 }
6089 new = PyUnicode_Concat(*pleft, right);
6090 Py_DECREF(*pleft);
6091 *pleft = new;
6092}
6093
6094void
6095PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6096{
6097 PyUnicode_Append(pleft, right);
6098 Py_XDECREF(right);
6099}
6100
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006101PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102"S.count(sub[, start[, end]]) -> int\n\
6103\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006104Return the number of non-overlapping occurrences of substring sub in\n\
6105Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006106interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107
6108static PyObject *
6109unicode_count(PyUnicodeObject *self, PyObject *args)
6110{
6111 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006112 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006113 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 PyObject *result;
6115
Guido van Rossumb8872e62000-05-09 14:14:27 +00006116 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6117 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118 return NULL;
6119
6120 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006121 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122 if (substring == NULL)
6123 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006124
Thomas Wouters477c8d52006-05-27 19:21:47 +00006125 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126
Thomas Wouters477c8d52006-05-27 19:21:47 +00006127 result = PyInt_FromSsize_t(
6128 stringlib_count(self->str + start, end - start,
6129 substring->str, substring->length)
6130 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131
6132 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006133
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134 return result;
6135}
6136
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006137PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006138"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006140Encodes S using the codec registered for encoding. encoding defaults\n\
6141to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006142handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006143a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6144'xmlcharrefreplace' as well as any other name registered with\n\
6145codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146
6147static PyObject *
6148unicode_encode(PyUnicodeObject *self, PyObject *args)
6149{
6150 char *encoding = NULL;
6151 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006152 PyObject *v;
6153
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6155 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006156 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006157 if (v == NULL)
6158 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006159 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00006160 if (PyString_Check(v)) {
6161 /* Old codec, turn it into bytes */
6162 PyObject *b = PyBytes_FromObject(v);
6163 Py_DECREF(v);
6164 return b;
6165 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006166 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006167 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006168 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006169 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006170 Py_DECREF(v);
6171 return NULL;
6172 }
6173 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006174
6175 onError:
6176 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006177}
6178
6179PyDoc_STRVAR(decode__doc__,
6180"S.decode([encoding[,errors]]) -> string or unicode\n\
6181\n\
6182Decodes S using the codec registered for encoding. encoding defaults\n\
6183to the default encoding. errors may be given to set a different error\n\
6184handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6185a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6186as well as any other name registerd with codecs.register_error that is\n\
6187able to handle UnicodeDecodeErrors.");
6188
6189static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006190unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006191{
6192 char *encoding = NULL;
6193 char *errors = NULL;
6194 PyObject *v;
6195
6196 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6197 return NULL;
6198 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006199 if (v == NULL)
6200 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006201 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6202 PyErr_Format(PyExc_TypeError,
6203 "decoder did not return a string/unicode object "
6204 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006205 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006206 Py_DECREF(v);
6207 return NULL;
6208 }
6209 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006210
6211 onError:
6212 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213}
6214
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006215PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216"S.expandtabs([tabsize]) -> unicode\n\
6217\n\
6218Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006219If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220
6221static PyObject*
6222unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6223{
6224 Py_UNICODE *e;
6225 Py_UNICODE *p;
6226 Py_UNICODE *q;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006227 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228 PyUnicodeObject *u;
6229 int tabsize = 8;
6230
6231 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6232 return NULL;
6233
Thomas Wouters7e474022000-07-16 12:04:32 +00006234 /* First pass: determine size of output string */
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006235 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236 e = self->str + self->length;
6237 for (p = self->str; p < e; p++)
6238 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006239 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240 j += tabsize - (j % tabsize);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006241 if (old_j > j) {
6242 PyErr_SetString(PyExc_OverflowError,
6243 "new string is too long");
6244 return NULL;
6245 }
6246 old_j = j;
6247 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006248 }
6249 else {
6250 j++;
6251 if (*p == '\n' || *p == '\r') {
6252 i += j;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006253 old_j = j = 0;
6254 if (i < 0) {
6255 PyErr_SetString(PyExc_OverflowError,
6256 "new string is too long");
6257 return NULL;
6258 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006259 }
6260 }
6261
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006262 if ((i + j) < 0) {
6263 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6264 return NULL;
6265 }
6266
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267 /* Second pass: create output string and fill it */
6268 u = _PyUnicode_New(i + j);
6269 if (!u)
6270 return NULL;
6271
6272 j = 0;
6273 q = u->str;
6274
6275 for (p = self->str; p < e; p++)
6276 if (*p == '\t') {
6277 if (tabsize > 0) {
6278 i = tabsize - (j % tabsize);
6279 j += i;
6280 while (i--)
6281 *q++ = ' ';
6282 }
6283 }
6284 else {
6285 j++;
6286 *q++ = *p;
6287 if (*p == '\n' || *p == '\r')
6288 j = 0;
6289 }
6290
6291 return (PyObject*) u;
6292}
6293
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006294PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295"S.find(sub [,start [,end]]) -> int\n\
6296\n\
6297Return the lowest index in S where substring sub is found,\n\
6298such that sub is contained within s[start,end]. Optional\n\
6299arguments start and end are interpreted as in slice notation.\n\
6300\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006301Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302
6303static PyObject *
6304unicode_find(PyUnicodeObject *self, PyObject *args)
6305{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006306 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006307 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006308 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006309 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006310
Guido van Rossumb8872e62000-05-09 14:14:27 +00006311 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6312 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006314 substring = PyUnicode_FromObject(substring);
6315 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316 return NULL;
6317
Thomas Wouters477c8d52006-05-27 19:21:47 +00006318 result = stringlib_find_slice(
6319 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6320 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6321 start, end
6322 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323
6324 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006325
6326 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327}
6328
6329static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006330unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331{
6332 if (index < 0 || index >= self->length) {
6333 PyErr_SetString(PyExc_IndexError, "string index out of range");
6334 return NULL;
6335 }
6336
6337 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6338}
6339
6340static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006341unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006343 /* Since Unicode objects compare equal to their UTF-8 string
6344 counterparts, we hash the UTF-8 string. */
6345 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
6346 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347}
6348
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006349PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350"S.index(sub [,start [,end]]) -> int\n\
6351\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006352Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353
6354static PyObject *
6355unicode_index(PyUnicodeObject *self, PyObject *args)
6356{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006357 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006358 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006359 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006360 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006361
Guido van Rossumb8872e62000-05-09 14:14:27 +00006362 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6363 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006365 substring = PyUnicode_FromObject(substring);
6366 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006367 return NULL;
6368
Thomas Wouters477c8d52006-05-27 19:21:47 +00006369 result = stringlib_find_slice(
6370 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6371 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6372 start, end
6373 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006374
6375 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006376
Guido van Rossumd57fd912000-03-10 22:53:23 +00006377 if (result < 0) {
6378 PyErr_SetString(PyExc_ValueError, "substring not found");
6379 return NULL;
6380 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006381
Martin v. Löwis18e16552006-02-15 17:27:45 +00006382 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383}
6384
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006385PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006386"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006388Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006389at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390
6391static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006392unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393{
6394 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6395 register const Py_UNICODE *e;
6396 int cased;
6397
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398 /* Shortcut for single character strings */
6399 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006400 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006402 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006403 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006404 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006405
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406 e = p + PyUnicode_GET_SIZE(self);
6407 cased = 0;
6408 for (; p < e; p++) {
6409 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006410
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006412 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413 else if (!cased && Py_UNICODE_ISLOWER(ch))
6414 cased = 1;
6415 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006416 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417}
6418
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006419PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006420"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006422Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006423at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424
6425static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006426unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427{
6428 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6429 register const Py_UNICODE *e;
6430 int cased;
6431
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432 /* Shortcut for single character strings */
6433 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006434 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006436 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006437 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006438 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006439
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440 e = p + PyUnicode_GET_SIZE(self);
6441 cased = 0;
6442 for (; p < e; p++) {
6443 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006444
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006446 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447 else if (!cased && Py_UNICODE_ISUPPER(ch))
6448 cased = 1;
6449 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006450 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451}
6452
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006453PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006454"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006456Return True if S is a titlecased string and there is at least one\n\
6457character in S, i.e. upper- and titlecase characters may only\n\
6458follow uncased characters and lowercase characters only cased ones.\n\
6459Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460
6461static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006462unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463{
6464 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6465 register const Py_UNICODE *e;
6466 int cased, previous_is_cased;
6467
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468 /* Shortcut for single character strings */
6469 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006470 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6471 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006473 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006474 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006475 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006476
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477 e = p + PyUnicode_GET_SIZE(self);
6478 cased = 0;
6479 previous_is_cased = 0;
6480 for (; p < e; p++) {
6481 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006482
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6484 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006485 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486 previous_is_cased = 1;
6487 cased = 1;
6488 }
6489 else if (Py_UNICODE_ISLOWER(ch)) {
6490 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006491 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492 previous_is_cased = 1;
6493 cased = 1;
6494 }
6495 else
6496 previous_is_cased = 0;
6497 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006498 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499}
6500
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006501PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006502"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006504Return True if all characters in S are whitespace\n\
6505and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506
6507static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006508unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509{
6510 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6511 register const Py_UNICODE *e;
6512
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513 /* Shortcut for single character strings */
6514 if (PyUnicode_GET_SIZE(self) == 1 &&
6515 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006516 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006518 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006519 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006520 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006521
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522 e = p + PyUnicode_GET_SIZE(self);
6523 for (; p < e; p++) {
6524 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006525 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006527 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528}
6529
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006530PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006531"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006532\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006533Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006534and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006535
6536static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006537unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006538{
6539 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6540 register const Py_UNICODE *e;
6541
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006542 /* Shortcut for single character strings */
6543 if (PyUnicode_GET_SIZE(self) == 1 &&
6544 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006545 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006546
6547 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006548 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006549 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006550
6551 e = p + PyUnicode_GET_SIZE(self);
6552 for (; p < e; p++) {
6553 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006554 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006555 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006556 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006557}
6558
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006559PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006560"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006561\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006562Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006563and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006564
6565static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006566unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006567{
6568 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6569 register const Py_UNICODE *e;
6570
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006571 /* Shortcut for single character strings */
6572 if (PyUnicode_GET_SIZE(self) == 1 &&
6573 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006574 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006575
6576 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006577 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006578 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006579
6580 e = p + PyUnicode_GET_SIZE(self);
6581 for (; p < e; p++) {
6582 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006583 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006584 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006585 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006586}
6587
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006588PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006589"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006591Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006592False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593
6594static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006595unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596{
6597 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6598 register const Py_UNICODE *e;
6599
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600 /* Shortcut for single character strings */
6601 if (PyUnicode_GET_SIZE(self) == 1 &&
6602 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006603 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006605 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006606 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006607 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006608
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609 e = p + PyUnicode_GET_SIZE(self);
6610 for (; p < e; p++) {
6611 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006612 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006614 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615}
6616
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006617PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006618"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006620Return True if all characters in S are digits\n\
6621and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622
6623static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006624unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625{
6626 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6627 register const Py_UNICODE *e;
6628
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629 /* Shortcut for single character strings */
6630 if (PyUnicode_GET_SIZE(self) == 1 &&
6631 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006632 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006634 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006635 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006636 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006637
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638 e = p + PyUnicode_GET_SIZE(self);
6639 for (; p < e; p++) {
6640 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006641 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006643 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644}
6645
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006646PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006647"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006649Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006650False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651
6652static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006653unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654{
6655 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6656 register const Py_UNICODE *e;
6657
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658 /* Shortcut for single character strings */
6659 if (PyUnicode_GET_SIZE(self) == 1 &&
6660 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006661 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006663 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006664 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006665 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006666
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667 e = p + PyUnicode_GET_SIZE(self);
6668 for (; p < e; p++) {
6669 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006670 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006672 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673}
6674
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006675PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676"S.join(sequence) -> unicode\n\
6677\n\
6678Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006679sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680
6681static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006682unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006684 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685}
6686
Martin v. Löwis18e16552006-02-15 17:27:45 +00006687static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688unicode_length(PyUnicodeObject *self)
6689{
6690 return self->length;
6691}
6692
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006693PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006694"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695\n\
6696Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006697done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698
6699static PyObject *
6700unicode_ljust(PyUnicodeObject *self, PyObject *args)
6701{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006702 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006703 Py_UNICODE fillchar = ' ';
6704
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006705 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706 return NULL;
6707
Tim Peters7a29bd52001-09-12 03:03:31 +00006708 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709 Py_INCREF(self);
6710 return (PyObject*) self;
6711 }
6712
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006713 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714}
6715
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006716PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717"S.lower() -> unicode\n\
6718\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006719Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720
6721static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006722unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724 return fixup(self, fixlower);
6725}
6726
/* Strip modes understood by the strip helpers */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the strip modes above */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string without its
   three-character "|O:" prefix */
#define STRIPNAME(i) (stripformat[i] + 3)
6735
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006736/* externally visible for str.strip(unicode) */
6737PyObject *
6738_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6739{
6740 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006741 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006742 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006743 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6744 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006745
Thomas Wouters477c8d52006-05-27 19:21:47 +00006746 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6747
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006748 i = 0;
6749 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006750 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6751 i++;
6752 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006753 }
6754
6755 j = len;
6756 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006757 do {
6758 j--;
6759 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6760 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006761 }
6762
6763 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006764 Py_INCREF(self);
6765 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006766 }
6767 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006768 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006769}
6770
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771
6772static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006773do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006775 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006776 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006777
6778 i = 0;
6779 if (striptype != RIGHTSTRIP) {
6780 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6781 i++;
6782 }
6783 }
6784
6785 j = len;
6786 if (striptype != LEFTSTRIP) {
6787 do {
6788 j--;
6789 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6790 j++;
6791 }
6792
6793 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6794 Py_INCREF(self);
6795 return (PyObject*)self;
6796 }
6797 else
6798 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799}
6800
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006801
6802static PyObject *
6803do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6804{
6805 PyObject *sep = NULL;
6806
6807 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6808 return NULL;
6809
6810 if (sep != NULL && sep != Py_None) {
6811 if (PyUnicode_Check(sep))
6812 return _PyUnicode_XStrip(self, striptype, sep);
6813 else if (PyString_Check(sep)) {
6814 PyObject *res;
6815 sep = PyUnicode_FromObject(sep);
6816 if (sep==NULL)
6817 return NULL;
6818 res = _PyUnicode_XStrip(self, striptype, sep);
6819 Py_DECREF(sep);
6820 return res;
6821 }
6822 else {
6823 PyErr_Format(PyExc_TypeError,
6824 "%s arg must be None, unicode or str",
6825 STRIPNAME(striptype));
6826 return NULL;
6827 }
6828 }
6829
6830 return do_strip(self, striptype);
6831}
6832
6833
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006834PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006835"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006836\n\
6837Return a copy of the string S with leading and trailing\n\
6838whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006839If chars is given and not None, remove characters in chars instead.\n\
6840If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006841
6842static PyObject *
6843unicode_strip(PyUnicodeObject *self, PyObject *args)
6844{
6845 if (PyTuple_GET_SIZE(args) == 0)
6846 return do_strip(self, BOTHSTRIP); /* Common case */
6847 else
6848 return do_argstrip(self, BOTHSTRIP, args);
6849}
6850
6851
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006852PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006853"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006854\n\
6855Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006856If chars is given and not None, remove characters in chars instead.\n\
6857If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006858
6859static PyObject *
6860unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6861{
6862 if (PyTuple_GET_SIZE(args) == 0)
6863 return do_strip(self, LEFTSTRIP); /* Common case */
6864 else
6865 return do_argstrip(self, LEFTSTRIP, args);
6866}
6867
6868
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006869PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006870"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006871\n\
6872Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006873If chars is given and not None, remove characters in chars instead.\n\
6874If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006875
6876static PyObject *
6877unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6878{
6879 if (PyTuple_GET_SIZE(args) == 0)
6880 return do_strip(self, RIGHTSTRIP); /* Common case */
6881 else
6882 return do_argstrip(self, RIGHTSTRIP, args);
6883}
6884
6885
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006887unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888{
6889 PyUnicodeObject *u;
6890 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006891 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006892 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893
6894 if (len < 0)
6895 len = 0;
6896
Tim Peters7a29bd52001-09-12 03:03:31 +00006897 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898 /* no repeat, return original string */
6899 Py_INCREF(str);
6900 return (PyObject*) str;
6901 }
Tim Peters8f422462000-09-09 06:13:41 +00006902
6903 /* ensure # of chars needed doesn't overflow int and # of bytes
6904 * needed doesn't overflow size_t
6905 */
6906 nchars = len * str->length;
6907 if (len && nchars / len != str->length) {
6908 PyErr_SetString(PyExc_OverflowError,
6909 "repeated string is too long");
6910 return NULL;
6911 }
6912 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6913 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6914 PyErr_SetString(PyExc_OverflowError,
6915 "repeated string is too long");
6916 return NULL;
6917 }
6918 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919 if (!u)
6920 return NULL;
6921
6922 p = u->str;
6923
Thomas Wouters477c8d52006-05-27 19:21:47 +00006924 if (str->length == 1 && len > 0) {
6925 Py_UNICODE_FILL(p, str->str[0], len);
6926 } else {
6927 Py_ssize_t done = 0; /* number of characters copied this far */
6928 if (done < nchars) {
6929 Py_UNICODE_COPY(p, str->str, str->length);
6930 done = str->length;
6931 }
6932 while (done < nchars) {
6933 int n = (done <= nchars-done) ? done : nchars-done;
6934 Py_UNICODE_COPY(p+done, p, n);
6935 done += n;
6936 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937 }
6938
6939 return (PyObject*) u;
6940}
6941
6942PyObject *PyUnicode_Replace(PyObject *obj,
6943 PyObject *subobj,
6944 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006945 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946{
6947 PyObject *self;
6948 PyObject *str1;
6949 PyObject *str2;
6950 PyObject *result;
6951
6952 self = PyUnicode_FromObject(obj);
6953 if (self == NULL)
6954 return NULL;
6955 str1 = PyUnicode_FromObject(subobj);
6956 if (str1 == NULL) {
6957 Py_DECREF(self);
6958 return NULL;
6959 }
6960 str2 = PyUnicode_FromObject(replobj);
6961 if (str2 == NULL) {
6962 Py_DECREF(self);
6963 Py_DECREF(str1);
6964 return NULL;
6965 }
Tim Petersced69f82003-09-16 20:30:58 +00006966 result = replace((PyUnicodeObject *)self,
6967 (PyUnicodeObject *)str1,
6968 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006969 maxcount);
6970 Py_DECREF(self);
6971 Py_DECREF(str1);
6972 Py_DECREF(str2);
6973 return result;
6974}
6975
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006976PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006977"S.replace (old, new[, maxsplit]) -> unicode\n\
6978\n\
6979Return a copy of S with all occurrences of substring\n\
6980old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006981given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982
6983static PyObject*
6984unicode_replace(PyUnicodeObject *self, PyObject *args)
6985{
6986 PyUnicodeObject *str1;
6987 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006988 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989 PyObject *result;
6990
Martin v. Löwis18e16552006-02-15 17:27:45 +00006991 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006992 return NULL;
6993 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6994 if (str1 == NULL)
6995 return NULL;
6996 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006997 if (str2 == NULL) {
6998 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007000 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001
7002 result = replace(self, str1, str2, maxcount);
7003
7004 Py_DECREF(str1);
7005 Py_DECREF(str2);
7006 return result;
7007}
7008
7009static
7010PyObject *unicode_repr(PyObject *unicode)
7011{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007012 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007013 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007014 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7015 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7016
7017 /* XXX(nnorwitz): rather than over-allocating, it would be
7018 better to choose a different scheme. Perhaps scan the
7019 first N-chars of the string and allocate based on that size.
7020 */
7021 /* Initial allocation is based on the longest-possible unichr
7022 escape.
7023
7024 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7025 unichr, so in this case it's the longest unichr escape. In
7026 narrow (UTF-16) builds this is five chars per source unichr
7027 since there are two unichrs in the surrogate pair, so in narrow
7028 (UTF-16) builds it's not the longest unichr escape.
7029
7030 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7031 so in the narrow (UTF-16) build case it's the longest unichr
7032 escape.
7033 */
7034
Walter Dörwald1ab83302007-05-18 17:15:44 +00007035 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007036 2 /* quotes */
7037#ifdef Py_UNICODE_WIDE
7038 + 10*size
7039#else
7040 + 6*size
7041#endif
7042 + 1);
7043 if (repr == NULL)
7044 return NULL;
7045
Walter Dörwald1ab83302007-05-18 17:15:44 +00007046 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007047
7048 /* Add quote */
7049 *p++ = (findchar(s, size, '\'') &&
7050 !findchar(s, size, '"')) ? '"' : '\'';
7051 while (size-- > 0) {
7052 Py_UNICODE ch = *s++;
7053
7054 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007055 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007056 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007057 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007058 continue;
7059 }
7060
7061#ifdef Py_UNICODE_WIDE
7062 /* Map 21-bit characters to '\U00xxxxxx' */
7063 else if (ch >= 0x10000) {
7064 *p++ = '\\';
7065 *p++ = 'U';
7066 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7067 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7068 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7069 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7070 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7071 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7072 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7073 *p++ = hexdigits[ch & 0x0000000F];
7074 continue;
7075 }
7076#else
7077 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7078 else if (ch >= 0xD800 && ch < 0xDC00) {
7079 Py_UNICODE ch2;
7080 Py_UCS4 ucs;
7081
7082 ch2 = *s++;
7083 size--;
7084 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7085 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7086 *p++ = '\\';
7087 *p++ = 'U';
7088 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7089 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7090 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7091 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7092 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7093 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7094 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7095 *p++ = hexdigits[ucs & 0x0000000F];
7096 continue;
7097 }
7098 /* Fall through: isolated surrogates are copied as-is */
7099 s--;
7100 size++;
7101 }
7102#endif
7103
7104 /* Map 16-bit characters to '\uxxxx' */
7105 if (ch >= 256) {
7106 *p++ = '\\';
7107 *p++ = 'u';
7108 *p++ = hexdigits[(ch >> 12) & 0x000F];
7109 *p++ = hexdigits[(ch >> 8) & 0x000F];
7110 *p++ = hexdigits[(ch >> 4) & 0x000F];
7111 *p++ = hexdigits[ch & 0x000F];
7112 }
7113
7114 /* Map special whitespace to '\t', \n', '\r' */
7115 else if (ch == '\t') {
7116 *p++ = '\\';
7117 *p++ = 't';
7118 }
7119 else if (ch == '\n') {
7120 *p++ = '\\';
7121 *p++ = 'n';
7122 }
7123 else if (ch == '\r') {
7124 *p++ = '\\';
7125 *p++ = 'r';
7126 }
7127
7128 /* Map non-printable US ASCII to '\xhh' */
7129 else if (ch < ' ' || ch >= 0x7F) {
7130 *p++ = '\\';
7131 *p++ = 'x';
7132 *p++ = hexdigits[(ch >> 4) & 0x000F];
7133 *p++ = hexdigits[ch & 0x000F];
7134 }
7135
7136 /* Copy everything else as-is */
7137 else
7138 *p++ = (char) ch;
7139 }
7140 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007141 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007142
7143 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007144 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007145 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146}
7147
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007148PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149"S.rfind(sub [,start [,end]]) -> int\n\
7150\n\
7151Return the highest index in S where substring sub is found,\n\
7152such that sub is contained within s[start,end]. Optional\n\
7153arguments start and end are interpreted as in slice notation.\n\
7154\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007155Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156
7157static PyObject *
7158unicode_rfind(PyUnicodeObject *self, PyObject *args)
7159{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007160 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007161 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007162 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007163 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007164
Guido van Rossumb8872e62000-05-09 14:14:27 +00007165 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7166 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007168 substring = PyUnicode_FromObject(substring);
7169 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007170 return NULL;
7171
Thomas Wouters477c8d52006-05-27 19:21:47 +00007172 result = stringlib_rfind_slice(
7173 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7174 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7175 start, end
7176 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007177
7178 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007179
7180 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181}
7182
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007183PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007184"S.rindex(sub [,start [,end]]) -> int\n\
7185\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007186Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007187
7188static PyObject *
7189unicode_rindex(PyUnicodeObject *self, PyObject *args)
7190{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007191 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007192 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007193 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007194 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195
Guido van Rossumb8872e62000-05-09 14:14:27 +00007196 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7197 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007198 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007199 substring = PyUnicode_FromObject(substring);
7200 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007201 return NULL;
7202
Thomas Wouters477c8d52006-05-27 19:21:47 +00007203 result = stringlib_rfind_slice(
7204 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7205 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7206 start, end
7207 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007208
7209 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007210
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211 if (result < 0) {
7212 PyErr_SetString(PyExc_ValueError, "substring not found");
7213 return NULL;
7214 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007215 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007216}
7217
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007218PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007219"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007220\n\
7221Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007222done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007223
7224static PyObject *
7225unicode_rjust(PyUnicodeObject *self, PyObject *args)
7226{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007227 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007228 Py_UNICODE fillchar = ' ';
7229
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007230 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231 return NULL;
7232
Tim Peters7a29bd52001-09-12 03:03:31 +00007233 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007234 Py_INCREF(self);
7235 return (PyObject*) self;
7236 }
7237
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007238 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007239}
7240
Guido van Rossumd57fd912000-03-10 22:53:23 +00007241static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007242unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243{
7244 /* standard clamping */
7245 if (start < 0)
7246 start = 0;
7247 if (end < 0)
7248 end = 0;
7249 if (end > self->length)
7250 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007251 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007252 /* full slice, return original string */
7253 Py_INCREF(self);
7254 return (PyObject*) self;
7255 }
7256 if (start > end)
7257 start = end;
7258 /* copy slice */
7259 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7260 end - start);
7261}
7262
7263PyObject *PyUnicode_Split(PyObject *s,
7264 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007265 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007266{
7267 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007268
Guido van Rossumd57fd912000-03-10 22:53:23 +00007269 s = PyUnicode_FromObject(s);
7270 if (s == NULL)
7271 return NULL;
7272 if (sep != NULL) {
7273 sep = PyUnicode_FromObject(sep);
7274 if (sep == NULL) {
7275 Py_DECREF(s);
7276 return NULL;
7277 }
7278 }
7279
7280 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7281
7282 Py_DECREF(s);
7283 Py_XDECREF(sep);
7284 return result;
7285}
7286
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007287PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007288"S.split([sep [,maxsplit]]) -> list of strings\n\
7289\n\
7290Return a list of the words in S, using sep as the\n\
7291delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007292splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007293any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294
7295static PyObject*
7296unicode_split(PyUnicodeObject *self, PyObject *args)
7297{
7298 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007299 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007300
Martin v. Löwis18e16552006-02-15 17:27:45 +00007301 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302 return NULL;
7303
7304 if (substring == Py_None)
7305 return split(self, NULL, maxcount);
7306 else if (PyUnicode_Check(substring))
7307 return split(self, (PyUnicodeObject *)substring, maxcount);
7308 else
7309 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7310}
7311
Thomas Wouters477c8d52006-05-27 19:21:47 +00007312PyObject *
7313PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7314{
7315 PyObject* str_obj;
7316 PyObject* sep_obj;
7317 PyObject* out;
7318
7319 str_obj = PyUnicode_FromObject(str_in);
7320 if (!str_obj)
7321 return NULL;
7322 sep_obj = PyUnicode_FromObject(sep_in);
7323 if (!sep_obj) {
7324 Py_DECREF(str_obj);
7325 return NULL;
7326 }
7327
7328 out = stringlib_partition(
7329 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7330 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7331 );
7332
7333 Py_DECREF(sep_obj);
7334 Py_DECREF(str_obj);
7335
7336 return out;
7337}
7338
7339
7340PyObject *
7341PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7342{
7343 PyObject* str_obj;
7344 PyObject* sep_obj;
7345 PyObject* out;
7346
7347 str_obj = PyUnicode_FromObject(str_in);
7348 if (!str_obj)
7349 return NULL;
7350 sep_obj = PyUnicode_FromObject(sep_in);
7351 if (!sep_obj) {
7352 Py_DECREF(str_obj);
7353 return NULL;
7354 }
7355
7356 out = stringlib_rpartition(
7357 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7358 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7359 );
7360
7361 Py_DECREF(sep_obj);
7362 Py_DECREF(str_obj);
7363
7364 return out;
7365}
7366
7367PyDoc_STRVAR(partition__doc__,
7368"S.partition(sep) -> (head, sep, tail)\n\
7369\n\
7370Searches for the separator sep in S, and returns the part before it,\n\
7371the separator itself, and the part after it. If the separator is not\n\
7372found, returns S and two empty strings.");
7373
7374static PyObject*
7375unicode_partition(PyUnicodeObject *self, PyObject *separator)
7376{
7377 return PyUnicode_Partition((PyObject *)self, separator);
7378}
7379
7380PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007381"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007382\n\
7383Searches for the separator sep in S, starting at the end of S, and returns\n\
7384the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007385separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007386
7387static PyObject*
7388unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7389{
7390 return PyUnicode_RPartition((PyObject *)self, separator);
7391}
7392
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007393PyObject *PyUnicode_RSplit(PyObject *s,
7394 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007395 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007396{
7397 PyObject *result;
7398
7399 s = PyUnicode_FromObject(s);
7400 if (s == NULL)
7401 return NULL;
7402 if (sep != NULL) {
7403 sep = PyUnicode_FromObject(sep);
7404 if (sep == NULL) {
7405 Py_DECREF(s);
7406 return NULL;
7407 }
7408 }
7409
7410 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7411
7412 Py_DECREF(s);
7413 Py_XDECREF(sep);
7414 return result;
7415}
7416
7417PyDoc_STRVAR(rsplit__doc__,
7418"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7419\n\
7420Return a list of the words in S, using sep as the\n\
7421delimiter string, starting at the end of the string and\n\
7422working to the front. If maxsplit is given, at most maxsplit\n\
7423splits are done. If sep is not specified, any whitespace string\n\
7424is a separator.");
7425
7426static PyObject*
7427unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7428{
7429 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007430 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007431
Martin v. Löwis18e16552006-02-15 17:27:45 +00007432 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007433 return NULL;
7434
7435 if (substring == Py_None)
7436 return rsplit(self, NULL, maxcount);
7437 else if (PyUnicode_Check(substring))
7438 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7439 else
7440 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7441}
7442
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007443PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007444"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445\n\
7446Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007447Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007448is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449
7450static PyObject*
7451unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7452{
Guido van Rossum86662912000-04-11 15:38:46 +00007453 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007454
Guido van Rossum86662912000-04-11 15:38:46 +00007455 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456 return NULL;
7457
Guido van Rossum86662912000-04-11 15:38:46 +00007458 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459}
7460
7461static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007462PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463{
Walter Dörwald346737f2007-05-31 10:44:43 +00007464 if (PyUnicode_CheckExact(self)) {
7465 Py_INCREF(self);
7466 return self;
7467 } else
7468 /* Subtype -- return genuine unicode string with the same value. */
7469 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7470 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471}
7472
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007473PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474"S.swapcase() -> unicode\n\
7475\n\
7476Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007477and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007478
7479static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007480unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007481{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482 return fixup(self, fixswapcase);
7483}
7484
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007485PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007486"S.translate(table) -> unicode\n\
7487\n\
7488Return a copy of the string S, where all characters have been mapped\n\
7489through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007490Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7491Unmapped characters are left untouched. Characters mapped to None\n\
7492are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007493
7494static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007495unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496{
Tim Petersced69f82003-09-16 20:30:58 +00007497 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007499 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500 "ignore");
7501}
7502
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007503PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504"S.upper() -> unicode\n\
7505\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007506Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007507
7508static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007509unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007510{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007511 return fixup(self, fixupper);
7512}
7513
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007514PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515"S.zfill(width) -> unicode\n\
7516\n\
7517Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007518of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007519
7520static PyObject *
7521unicode_zfill(PyUnicodeObject *self, PyObject *args)
7522{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007523 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007524 PyUnicodeObject *u;
7525
Martin v. Löwis18e16552006-02-15 17:27:45 +00007526 Py_ssize_t width;
7527 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528 return NULL;
7529
7530 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007531 if (PyUnicode_CheckExact(self)) {
7532 Py_INCREF(self);
7533 return (PyObject*) self;
7534 }
7535 else
7536 return PyUnicode_FromUnicode(
7537 PyUnicode_AS_UNICODE(self),
7538 PyUnicode_GET_SIZE(self)
7539 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007540 }
7541
7542 fill = width - self->length;
7543
7544 u = pad(self, fill, 0, '0');
7545
Walter Dörwald068325e2002-04-15 13:36:47 +00007546 if (u == NULL)
7547 return NULL;
7548
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549 if (u->str[fill] == '+' || u->str[fill] == '-') {
7550 /* move sign to beginning of string */
7551 u->str[0] = u->str[fill];
7552 u->str[fill] = '0';
7553 }
7554
7555 return (PyObject*) u;
7556}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007557
#if 0
/* Compiled out: would return unicode_freelist_size as an int object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *count = PyInt_FromLong(unicode_freelist_size);

    return count;
}
#endif
7565
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007566PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007567"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007569Return True if S starts with the specified prefix, False otherwise.\n\
7570With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007571With optional end, stop comparing S at that position.\n\
7572prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573
7574static PyObject *
7575unicode_startswith(PyUnicodeObject *self,
7576 PyObject *args)
7577{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007578 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007580 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007581 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007582 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007584 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007585 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007587 if (PyTuple_Check(subobj)) {
7588 Py_ssize_t i;
7589 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7590 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7591 PyTuple_GET_ITEM(subobj, i));
7592 if (substring == NULL)
7593 return NULL;
7594 result = tailmatch(self, substring, start, end, -1);
7595 Py_DECREF(substring);
7596 if (result) {
7597 Py_RETURN_TRUE;
7598 }
7599 }
7600 /* nothing matched */
7601 Py_RETURN_FALSE;
7602 }
7603 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007604 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007605 return NULL;
7606 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007608 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609}
7610
7611
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007612PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007613"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007615Return True if S ends with the specified suffix, False otherwise.\n\
7616With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007617With optional end, stop comparing S at that position.\n\
7618suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007619
7620static PyObject *
7621unicode_endswith(PyUnicodeObject *self,
7622 PyObject *args)
7623{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007624 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007625 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007626 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007627 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007628 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007629
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007630 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7631 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007632 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007633 if (PyTuple_Check(subobj)) {
7634 Py_ssize_t i;
7635 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7636 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7637 PyTuple_GET_ITEM(subobj, i));
7638 if (substring == NULL)
7639 return NULL;
7640 result = tailmatch(self, substring, start, end, +1);
7641 Py_DECREF(substring);
7642 if (result) {
7643 Py_RETURN_TRUE;
7644 }
7645 }
7646 Py_RETURN_FALSE;
7647 }
7648 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007649 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007650 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007651
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007652 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007653 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007654 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655}
7656
7657
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007658
7659static PyObject *
7660unicode_getnewargs(PyUnicodeObject *v)
7661{
7662 return Py_BuildValue("(u#)", v->str, v->length);
7663}
7664
7665
Guido van Rossumd57fd912000-03-10 22:53:23 +00007666static PyMethodDef unicode_methods[] = {
7667
7668 /* Order is according to common usage: often used methods should
7669 appear first, since lookup is done sequentially. */
7670
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007671 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7672 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7673 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007674 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007675 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7676 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7677 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7678 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7679 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7680 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7681 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007682 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007683 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7684 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7685 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007686 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007687 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007688/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7689 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7690 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7691 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007692 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007693 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007694 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007695 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007696 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7697 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7698 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7699 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7700 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7701 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7702 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7703 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7704 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7705 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7706 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7707 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7708 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7709 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007710 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007711#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007712 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007713#endif
7714
7715#if 0
7716 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007717 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718#endif
7719
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007720 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007721 {NULL, NULL}
7722};
7723
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007724static PyObject *
7725unicode_mod(PyObject *v, PyObject *w)
7726{
7727 if (!PyUnicode_Check(v)) {
7728 Py_INCREF(Py_NotImplemented);
7729 return Py_NotImplemented;
7730 }
7731 return PyUnicode_Format(v, w);
7732}
7733
7734static PyNumberMethods unicode_as_number = {
7735 0, /*nb_add*/
7736 0, /*nb_subtract*/
7737 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007738 unicode_mod, /*nb_remainder*/
7739};
7740
Guido van Rossumd57fd912000-03-10 22:53:23 +00007741static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007742 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007743 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007744 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7745 (ssizeargfunc) unicode_getitem, /* sq_item */
7746 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007747 0, /* sq_ass_item */
7748 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007749 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007750};
7751
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007752static PyObject*
7753unicode_subscript(PyUnicodeObject* self, PyObject* item)
7754{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007755 if (PyIndex_Check(item)) {
7756 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007757 if (i == -1 && PyErr_Occurred())
7758 return NULL;
7759 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007760 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007761 return unicode_getitem(self, i);
7762 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007763 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007764 Py_UNICODE* source_buf;
7765 Py_UNICODE* result_buf;
7766 PyObject* result;
7767
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007768 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007769 &start, &stop, &step, &slicelength) < 0) {
7770 return NULL;
7771 }
7772
7773 if (slicelength <= 0) {
7774 return PyUnicode_FromUnicode(NULL, 0);
7775 } else {
7776 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007777 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7778 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007779
7780 if (result_buf == NULL)
7781 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007782
7783 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7784 result_buf[i] = source_buf[cur];
7785 }
Tim Petersced69f82003-09-16 20:30:58 +00007786
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007787 result = PyUnicode_FromUnicode(result_buf, slicelength);
7788 PyMem_FREE(result_buf);
7789 return result;
7790 }
7791 } else {
7792 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7793 return NULL;
7794 }
7795}
7796
7797static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007798 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007799 (binaryfunc)unicode_subscript, /* mp_subscript */
7800 (objobjargproc)0, /* mp_ass_subscript */
7801};
7802
Martin v. Löwis18e16552006-02-15 17:27:45 +00007803static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007805 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007806 const void **ptr)
7807{
7808 if (index != 0) {
7809 PyErr_SetString(PyExc_SystemError,
7810 "accessing non-existent unicode segment");
7811 return -1;
7812 }
7813 *ptr = (void *) self->str;
7814 return PyUnicode_GET_DATA_SIZE(self);
7815}
7816
Martin v. Löwis18e16552006-02-15 17:27:45 +00007817static Py_ssize_t
7818unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007819 const void **ptr)
7820{
7821 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007822 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007823 return -1;
7824}
7825
7826static int
7827unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007828 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007829{
7830 if (lenp)
7831 *lenp = PyUnicode_GET_DATA_SIZE(self);
7832 return 1;
7833}
7834
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007835static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007836unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007837 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838 const void **ptr)
7839{
7840 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007841
Guido van Rossumd57fd912000-03-10 22:53:23 +00007842 if (index != 0) {
7843 PyErr_SetString(PyExc_SystemError,
7844 "accessing non-existent unicode segment");
7845 return -1;
7846 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007847 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007848 if (str == NULL)
7849 return -1;
7850 *ptr = (void *) PyString_AS_STRING(str);
7851 return PyString_GET_SIZE(str);
7852}
7853
7854/* Helpers for PyUnicode_Format() */
7855
7856static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007857getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007858{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007859 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007860 if (argidx < arglen) {
7861 (*p_argidx)++;
7862 if (arglen < 0)
7863 return args;
7864 else
7865 return PyTuple_GetItem(args, argidx);
7866 }
7867 PyErr_SetString(PyExc_TypeError,
7868 "not enough arguments for format string");
7869 return NULL;
7870}
7871
/* Conversion flags gathered while parsing a format specifier; one bit
   per flag character so they can be OR-ed together.
   NOTE(review): the flag word is also passed on to
   _PyString_FormatLong(); presumably these values have to stay in sync
   with that function's own definitions -- confirm before renumbering. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
7877
Martin v. Löwis18e16552006-02-15 17:27:45 +00007878static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007879strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007880{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007881 register Py_ssize_t i;
7882 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007883 for (i = len - 1; i >= 0; i--)
7884 buffer[i] = (Py_UNICODE) charbuffer[i];
7885
Guido van Rossumd57fd912000-03-10 22:53:23 +00007886 return len;
7887}
7888
Neal Norwitzfc76d632006-01-10 06:03:13 +00007889static int
7890doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7891{
Tim Peters15231542006-02-16 01:08:01 +00007892 Py_ssize_t result;
7893
Neal Norwitzfc76d632006-01-10 06:03:13 +00007894 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007895 result = strtounicode(buffer, (char *)buffer);
7896 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007897}
7898
7899static int
7900longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7901{
Tim Peters15231542006-02-16 01:08:01 +00007902 Py_ssize_t result;
7903
Neal Norwitzfc76d632006-01-10 06:03:13 +00007904 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007905 result = strtounicode(buffer, (char *)buffer);
7906 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007907}
7908
Guido van Rossum078151d2002-08-11 04:24:12 +00007909/* XXX To save some code duplication, formatfloat/long/int could have been
7910 shared with stringobject.c, converting from 8-bit to Unicode after the
7911 formatting is done. */
7912
Guido van Rossumd57fd912000-03-10 22:53:23 +00007913static int
7914formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007915 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007916 int flags,
7917 int prec,
7918 int type,
7919 PyObject *v)
7920{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007921 /* fmt = '%#.' + `prec` + `type`
7922 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007923 char fmt[20];
7924 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007925
Guido van Rossumd57fd912000-03-10 22:53:23 +00007926 x = PyFloat_AsDouble(v);
7927 if (x == -1.0 && PyErr_Occurred())
7928 return -1;
7929 if (prec < 0)
7930 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007931 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7932 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007933 /* Worst case length calc to ensure no buffer overrun:
7934
7935 'g' formats:
7936 fmt = %#.<prec>g
7937 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7938 for any double rep.)
7939 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7940
7941 'f' formats:
7942 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7943 len = 1 + 50 + 1 + prec = 52 + prec
7944
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007945 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007946 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007947
7948 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00007949 if (((type == 'g' || type == 'G') &&
7950 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007951 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007952 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007953 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007954 return -1;
7955 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007956 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7957 (flags&F_ALT) ? "#" : "",
7958 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007959 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007960}
7961
Tim Peters38fd5b62000-09-21 05:43:11 +00007962static PyObject*
7963formatlong(PyObject *val, int flags, int prec, int type)
7964{
7965 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00007966 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00007967 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00007968 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00007969
7970 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7971 if (!str)
7972 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00007973 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00007974 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00007975 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00007976}
7977
Guido van Rossumd57fd912000-03-10 22:53:23 +00007978static int
7979formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007980 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981 int flags,
7982 int prec,
7983 int type,
7984 PyObject *v)
7985{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007986 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007987 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7988 * + 1 + 1
7989 * = 24
7990 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007991 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007992 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007993 long x;
7994
7995 x = PyInt_AsLong(v);
7996 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007997 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007998 if (x < 0 && type == 'u') {
7999 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008000 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008001 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8002 sign = "-";
8003 else
8004 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008005 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008006 prec = 1;
8007
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008008 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8009 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008010 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008011 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008012 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008013 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008014 return -1;
8015 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008016
8017 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008018 (type == 'x' || type == 'X' || type == 'o')) {
8019 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008020 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008021 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008022 * - when 0 is being converted, the C standard leaves off
8023 * the '0x' or '0X', which is inconsistent with other
8024 * %#x/%#X conversions and inconsistent with Python's
8025 * hex() function
8026 * - there are platforms that violate the standard and
8027 * convert 0 with the '0x' or '0X'
8028 * (Metrowerks, Compaq Tru64)
8029 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008030 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008031 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008032 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008033 * We can achieve the desired consistency by inserting our
8034 * own '0x' or '0X' prefix, and substituting %x/%X in place
8035 * of %#x/%#X.
8036 *
8037 * Note that this is the same approach as used in
8038 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008039 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008040 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8041 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008042 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008043 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008044 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8045 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008046 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008047 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008048 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008049 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008050 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008051 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052}
8053
8054static int
8055formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008056 size_t buflen,
8057 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008058{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008059 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008060 if (PyUnicode_Check(v)) {
8061 if (PyUnicode_GET_SIZE(v) != 1)
8062 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008063 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008064 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008065
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008066 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008067 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008068 goto onError;
8069 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8070 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008071
8072 else {
8073 /* Integer input truncated to a character */
8074 long x;
8075 x = PyInt_AsLong(v);
8076 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008077 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008078#ifdef Py_UNICODE_WIDE
8079 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008080 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008081 "%c arg not in range(0x110000) "
8082 "(wide Python build)");
8083 return -1;
8084 }
8085#else
8086 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008087 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008088 "%c arg not in range(0x10000) "
8089 "(narrow Python build)");
8090 return -1;
8091 }
8092#endif
8093 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008094 }
8095 buf[1] = '\0';
8096 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008097
8098 onError:
8099 PyErr_SetString(PyExc_TypeError,
8100 "%c requires int or char");
8101 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102}
8103
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the size, in characters, of the scratch buffer that
   float, int and char conversions are formatted into.  XXX It is a
   magic number.  The formatting routines check their bounds against it
   so nothing overflows, but allocating a buffer of the right size for
   each conversion might be a better design.  For now this is
   sufficient.
*/
#define FORMATBUFLEN ((size_t)120)
8113
Guido van Rossumd57fd912000-03-10 22:53:23 +00008114PyObject *PyUnicode_Format(PyObject *format,
8115 PyObject *args)
8116{
8117 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008118 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008119 int args_owned = 0;
8120 PyUnicodeObject *result = NULL;
8121 PyObject *dict = NULL;
8122 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008123
Guido van Rossumd57fd912000-03-10 22:53:23 +00008124 if (format == NULL || args == NULL) {
8125 PyErr_BadInternalCall();
8126 return NULL;
8127 }
8128 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008129 if (uformat == NULL)
8130 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131 fmt = PyUnicode_AS_UNICODE(uformat);
8132 fmtcnt = PyUnicode_GET_SIZE(uformat);
8133
8134 reslen = rescnt = fmtcnt + 100;
8135 result = _PyUnicode_New(reslen);
8136 if (result == NULL)
8137 goto onError;
8138 res = PyUnicode_AS_UNICODE(result);
8139
8140 if (PyTuple_Check(args)) {
8141 arglen = PyTuple_Size(args);
8142 argidx = 0;
8143 }
8144 else {
8145 arglen = -1;
8146 argidx = -2;
8147 }
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008148 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008149 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008150 dict = args;
8151
8152 while (--fmtcnt >= 0) {
8153 if (*fmt != '%') {
8154 if (--rescnt < 0) {
8155 rescnt = fmtcnt + 100;
8156 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008157 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008158 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008159 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8160 --rescnt;
8161 }
8162 *res++ = *fmt++;
8163 }
8164 else {
8165 /* Got a format specifier */
8166 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008167 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008168 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008169 Py_UNICODE c = '\0';
8170 Py_UNICODE fill;
8171 PyObject *v = NULL;
8172 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008173 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008174 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008175 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008176 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008177
8178 fmt++;
8179 if (*fmt == '(') {
8180 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008181 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008182 PyObject *key;
8183 int pcount = 1;
8184
8185 if (dict == NULL) {
8186 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008187 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008188 goto onError;
8189 }
8190 ++fmt;
8191 --fmtcnt;
8192 keystart = fmt;
8193 /* Skip over balanced parentheses */
8194 while (pcount > 0 && --fmtcnt >= 0) {
8195 if (*fmt == ')')
8196 --pcount;
8197 else if (*fmt == '(')
8198 ++pcount;
8199 fmt++;
8200 }
8201 keylen = fmt - keystart - 1;
8202 if (fmtcnt < 0 || pcount > 0) {
8203 PyErr_SetString(PyExc_ValueError,
8204 "incomplete format key");
8205 goto onError;
8206 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008207#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008208 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008209 then looked up since Python uses strings to hold
8210 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008211 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008212 key = PyUnicode_EncodeUTF8(keystart,
8213 keylen,
8214 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008215#else
8216 key = PyUnicode_FromUnicode(keystart, keylen);
8217#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008218 if (key == NULL)
8219 goto onError;
8220 if (args_owned) {
8221 Py_DECREF(args);
8222 args_owned = 0;
8223 }
8224 args = PyObject_GetItem(dict, key);
8225 Py_DECREF(key);
8226 if (args == NULL) {
8227 goto onError;
8228 }
8229 args_owned = 1;
8230 arglen = -1;
8231 argidx = -2;
8232 }
8233 while (--fmtcnt >= 0) {
8234 switch (c = *fmt++) {
8235 case '-': flags |= F_LJUST; continue;
8236 case '+': flags |= F_SIGN; continue;
8237 case ' ': flags |= F_BLANK; continue;
8238 case '#': flags |= F_ALT; continue;
8239 case '0': flags |= F_ZERO; continue;
8240 }
8241 break;
8242 }
8243 if (c == '*') {
8244 v = getnextarg(args, arglen, &argidx);
8245 if (v == NULL)
8246 goto onError;
8247 if (!PyInt_Check(v)) {
8248 PyErr_SetString(PyExc_TypeError,
8249 "* wants int");
8250 goto onError;
8251 }
8252 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008253 if (width == -1 && PyErr_Occurred())
8254 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008255 if (width < 0) {
8256 flags |= F_LJUST;
8257 width = -width;
8258 }
8259 if (--fmtcnt >= 0)
8260 c = *fmt++;
8261 }
8262 else if (c >= '0' && c <= '9') {
8263 width = c - '0';
8264 while (--fmtcnt >= 0) {
8265 c = *fmt++;
8266 if (c < '0' || c > '9')
8267 break;
8268 if ((width*10) / 10 != width) {
8269 PyErr_SetString(PyExc_ValueError,
8270 "width too big");
8271 goto onError;
8272 }
8273 width = width*10 + (c - '0');
8274 }
8275 }
8276 if (c == '.') {
8277 prec = 0;
8278 if (--fmtcnt >= 0)
8279 c = *fmt++;
8280 if (c == '*') {
8281 v = getnextarg(args, arglen, &argidx);
8282 if (v == NULL)
8283 goto onError;
8284 if (!PyInt_Check(v)) {
8285 PyErr_SetString(PyExc_TypeError,
8286 "* wants int");
8287 goto onError;
8288 }
8289 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008290 if (prec == -1 && PyErr_Occurred())
8291 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008292 if (prec < 0)
8293 prec = 0;
8294 if (--fmtcnt >= 0)
8295 c = *fmt++;
8296 }
8297 else if (c >= '0' && c <= '9') {
8298 prec = c - '0';
8299 while (--fmtcnt >= 0) {
8300 c = Py_CHARMASK(*fmt++);
8301 if (c < '0' || c > '9')
8302 break;
8303 if ((prec*10) / 10 != prec) {
8304 PyErr_SetString(PyExc_ValueError,
8305 "prec too big");
8306 goto onError;
8307 }
8308 prec = prec*10 + (c - '0');
8309 }
8310 }
8311 } /* prec */
8312 if (fmtcnt >= 0) {
8313 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008314 if (--fmtcnt >= 0)
8315 c = *fmt++;
8316 }
8317 }
8318 if (fmtcnt < 0) {
8319 PyErr_SetString(PyExc_ValueError,
8320 "incomplete format");
8321 goto onError;
8322 }
8323 if (c != '%') {
8324 v = getnextarg(args, arglen, &argidx);
8325 if (v == NULL)
8326 goto onError;
8327 }
8328 sign = 0;
8329 fill = ' ';
8330 switch (c) {
8331
8332 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008333 pbuf = formatbuf;
8334 /* presume that buffer length is at least 1 */
8335 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008336 len = 1;
8337 break;
8338
8339 case 's':
8340 case 'r':
8341 if (PyUnicode_Check(v) && c == 's') {
8342 temp = v;
8343 Py_INCREF(temp);
8344 }
8345 else {
8346 PyObject *unicode;
8347 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008348 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008349 else
8350 temp = PyObject_Repr(v);
8351 if (temp == NULL)
8352 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008353 if (PyUnicode_Check(temp))
8354 /* nothing to do */;
8355 else if (PyString_Check(temp)) {
8356 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008357 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008358 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008359 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008360 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008361 Py_DECREF(temp);
8362 temp = unicode;
8363 if (temp == NULL)
8364 goto onError;
8365 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008366 else {
8367 Py_DECREF(temp);
8368 PyErr_SetString(PyExc_TypeError,
8369 "%s argument has non-string str()");
8370 goto onError;
8371 }
8372 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008373 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008374 len = PyUnicode_GET_SIZE(temp);
8375 if (prec >= 0 && len > prec)
8376 len = prec;
8377 break;
8378
8379 case 'i':
8380 case 'd':
8381 case 'u':
8382 case 'o':
8383 case 'x':
8384 case 'X':
8385 if (c == 'i')
8386 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008387 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008388 temp = formatlong(v, flags, prec, c);
8389 if (!temp)
8390 goto onError;
8391 pbuf = PyUnicode_AS_UNICODE(temp);
8392 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008393 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008394 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008395 else {
8396 pbuf = formatbuf;
8397 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8398 flags, prec, c, v);
8399 if (len < 0)
8400 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008401 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008402 }
8403 if (flags & F_ZERO)
8404 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008405 break;
8406
8407 case 'e':
8408 case 'E':
8409 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008410 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008411 case 'g':
8412 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008413 if (c == 'F')
8414 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008415 pbuf = formatbuf;
8416 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8417 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008418 if (len < 0)
8419 goto onError;
8420 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008421 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008422 fill = '0';
8423 break;
8424
8425 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008426 pbuf = formatbuf;
8427 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008428 if (len < 0)
8429 goto onError;
8430 break;
8431
8432 default:
8433 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008434 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008435 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008436 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008437 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008438 (Py_ssize_t)(fmt - 1 -
8439 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008440 goto onError;
8441 }
8442 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008443 if (*pbuf == '-' || *pbuf == '+') {
8444 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008445 len--;
8446 }
8447 else if (flags & F_SIGN)
8448 sign = '+';
8449 else if (flags & F_BLANK)
8450 sign = ' ';
8451 else
8452 sign = 0;
8453 }
8454 if (width < len)
8455 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008456 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008457 reslen -= rescnt;
8458 rescnt = width + fmtcnt + 100;
8459 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008460 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008461 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008462 PyErr_NoMemory();
8463 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008464 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008465 if (_PyUnicode_Resize(&result, reslen) < 0) {
8466 Py_XDECREF(temp);
8467 goto onError;
8468 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008469 res = PyUnicode_AS_UNICODE(result)
8470 + reslen - rescnt;
8471 }
8472 if (sign) {
8473 if (fill != ' ')
8474 *res++ = sign;
8475 rescnt--;
8476 if (width > len)
8477 width--;
8478 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008479 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008480 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008481 assert(pbuf[1] == c);
8482 if (fill != ' ') {
8483 *res++ = *pbuf++;
8484 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008485 }
Tim Petersfff53252001-04-12 18:38:48 +00008486 rescnt -= 2;
8487 width -= 2;
8488 if (width < 0)
8489 width = 0;
8490 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008491 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008492 if (width > len && !(flags & F_LJUST)) {
8493 do {
8494 --rescnt;
8495 *res++ = fill;
8496 } while (--width > len);
8497 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008498 if (fill == ' ') {
8499 if (sign)
8500 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008501 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008502 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008503 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008504 *res++ = *pbuf++;
8505 *res++ = *pbuf++;
8506 }
8507 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008508 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008509 res += len;
8510 rescnt -= len;
8511 while (--width >= len) {
8512 --rescnt;
8513 *res++ = ' ';
8514 }
8515 if (dict && (argidx < arglen) && c != '%') {
8516 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008517 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008518 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008519 goto onError;
8520 }
8521 Py_XDECREF(temp);
8522 } /* '%' */
8523 } /* until end */
8524 if (argidx < arglen && !dict) {
8525 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008526 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008527 goto onError;
8528 }
8529
Thomas Woutersa96affe2006-03-12 00:29:36 +00008530 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8531 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008532 if (args_owned) {
8533 Py_DECREF(args);
8534 }
8535 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008536 return (PyObject *)result;
8537
8538 onError:
8539 Py_XDECREF(result);
8540 Py_DECREF(uformat);
8541 if (args_owned) {
8542 Py_DECREF(args);
8543 }
8544 return NULL;
8545}
8546
8547static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008548 (readbufferproc) unicode_buffer_getreadbuf,
8549 (writebufferproc) unicode_buffer_getwritebuf,
8550 (segcountproc) unicode_buffer_getsegcount,
8551 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008552};
8553
Jeremy Hylton938ace62002-07-17 16:30:39 +00008554static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008555unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8556
Tim Peters6d6c1a32001-08-02 04:15:00 +00008557static PyObject *
8558unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8559{
8560 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00008561 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008562 char *encoding = NULL;
8563 char *errors = NULL;
8564
Guido van Rossume023fe02001-08-30 03:12:59 +00008565 if (type != &PyUnicode_Type)
8566 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008567 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8568 kwlist, &x, &encoding, &errors))
8569 return NULL;
8570 if (x == NULL)
8571 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008572 if (encoding == NULL && errors == NULL)
8573 return PyObject_Unicode(x);
8574 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008575 return PyUnicode_FromEncodedObject(x, encoding, errors);
8576}
8577
Guido van Rossume023fe02001-08-30 03:12:59 +00008578static PyObject *
8579unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8580{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008581 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008582 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008583
8584 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8585 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8586 if (tmp == NULL)
8587 return NULL;
8588 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008589 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008590 if (pnew == NULL) {
8591 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008592 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008593 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008594 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8595 if (pnew->str == NULL) {
8596 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008597 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008598 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008599 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008600 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008601 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8602 pnew->length = n;
8603 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008604 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008605 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008606}
8607
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008608PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008609"unicode(string [, encoding[, errors]]) -> object\n\
8610\n\
8611Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008612encoding defaults to the current default string encoding.\n\
8613errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008614
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008615static PyObject *unicode_iter(PyObject *seq);
8616
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008618 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008619 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620 sizeof(PyUnicodeObject), /* tp_size */
8621 0, /* tp_itemsize */
8622 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008623 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008625 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008626 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008627 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008628 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008629 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008630 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008631 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008632 (hashfunc) unicode_hash, /* tp_hash*/
8633 0, /* tp_call*/
8634 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008635 PyObject_GenericGetAttr, /* tp_getattro */
8636 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008637 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008638 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8639 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008640 unicode_doc, /* tp_doc */
8641 0, /* tp_traverse */
8642 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008643 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008644 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008645 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008646 0, /* tp_iternext */
8647 unicode_methods, /* tp_methods */
8648 0, /* tp_members */
8649 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008650 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008651 0, /* tp_dict */
8652 0, /* tp_descr_get */
8653 0, /* tp_descr_set */
8654 0, /* tp_dictoffset */
8655 0, /* tp_init */
8656 0, /* tp_alloc */
8657 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008658 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008659};
8660
8661/* Initialize the Unicode implementation */
8662
Thomas Wouters78890102000-07-22 19:25:51 +00008663void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008664{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008665 int i;
8666
Thomas Wouters477c8d52006-05-27 19:21:47 +00008667 /* XXX - move this array to unicodectype.c ? */
8668 Py_UNICODE linebreak[] = {
8669 0x000A, /* LINE FEED */
8670 0x000D, /* CARRIAGE RETURN */
8671 0x001C, /* FILE SEPARATOR */
8672 0x001D, /* GROUP SEPARATOR */
8673 0x001E, /* RECORD SEPARATOR */
8674 0x0085, /* NEXT LINE */
8675 0x2028, /* LINE SEPARATOR */
8676 0x2029, /* PARAGRAPH SEPARATOR */
8677 };
8678
Fred Drakee4315f52000-05-09 19:53:39 +00008679 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008680 unicode_freelist = NULL;
8681 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008683 if (!unicode_empty)
8684 return;
8685
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008686 for (i = 0; i < 256; i++)
8687 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008688 if (PyType_Ready(&PyUnicode_Type) < 0)
8689 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008690
8691 /* initialize the linebreak bloom filter */
8692 bloom_linebreak = make_bloom_mask(
8693 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8694 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008695
8696 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008697}
8698
8699/* Finalize the Unicode implementation */
8700
8701void
Thomas Wouters78890102000-07-22 19:25:51 +00008702_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008703{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008704 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008705 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008706
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008707 Py_XDECREF(unicode_empty);
8708 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008709
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008710 for (i = 0; i < 256; i++) {
8711 if (unicode_latin1[i]) {
8712 Py_DECREF(unicode_latin1[i]);
8713 unicode_latin1[i] = NULL;
8714 }
8715 }
8716
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008717 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008718 PyUnicodeObject *v = u;
8719 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008720 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008721 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008722 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008723 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008725 unicode_freelist = NULL;
8726 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008727}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008728
Walter Dörwald16807132007-05-25 13:52:07 +00008729void
8730PyUnicode_InternInPlace(PyObject **p)
8731{
8732 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
8733 PyObject *t;
8734 if (s == NULL || !PyUnicode_Check(s))
8735 Py_FatalError(
8736 "PyUnicode_InternInPlace: unicode strings only please!");
8737 /* If it's a subclass, we don't really know what putting
8738 it in the interned dict might do. */
8739 if (!PyUnicode_CheckExact(s))
8740 return;
8741 if (PyUnicode_CHECK_INTERNED(s))
8742 return;
8743 if (interned == NULL) {
8744 interned = PyDict_New();
8745 if (interned == NULL) {
8746 PyErr_Clear(); /* Don't leave an exception */
8747 return;
8748 }
8749 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00008750 /* It might be that the GetItem call fails even
8751 though the key is present in the dictionary,
8752 namely when this happens during a stack overflow. */
8753 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00008754 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00008755 Py_END_ALLOW_RECURSION
8756
Walter Dörwald16807132007-05-25 13:52:07 +00008757 if (t) {
8758 Py_INCREF(t);
8759 Py_DECREF(*p);
8760 *p = t;
8761 return;
8762 }
8763
Martin v. Löwis5b222132007-06-10 09:51:05 +00008764 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00008765 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
8766 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00008767 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00008768 return;
8769 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00008770 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00008771 /* The two references in interned are not counted by refcnt.
8772 The deallocator will take care of this */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00008773 Py_Refcnt(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00008774 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
8775}
8776
8777void
8778PyUnicode_InternImmortal(PyObject **p)
8779{
8780 PyUnicode_InternInPlace(p);
8781 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
8782 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
8783 Py_INCREF(*p);
8784 }
8785}
8786
8787PyObject *
8788PyUnicode_InternFromString(const char *cp)
8789{
8790 PyObject *s = PyUnicode_FromString(cp);
8791 if (s == NULL)
8792 return NULL;
8793 PyUnicode_InternInPlace(&s);
8794 return s;
8795}
8796
8797void _Py_ReleaseInternedUnicodeStrings(void)
8798{
8799 PyObject *keys;
8800 PyUnicodeObject *s;
8801 Py_ssize_t i, n;
8802 Py_ssize_t immortal_size = 0, mortal_size = 0;
8803
8804 if (interned == NULL || !PyDict_Check(interned))
8805 return;
8806 keys = PyDict_Keys(interned);
8807 if (keys == NULL || !PyList_Check(keys)) {
8808 PyErr_Clear();
8809 return;
8810 }
8811
8812 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
8813 detector, interned unicode strings are not forcibly deallocated;
8814 rather, we give them their stolen references back, and then clear
8815 and DECREF the interned dict. */
8816
8817 n = PyList_GET_SIZE(keys);
8818 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
8819 n);
8820 for (i = 0; i < n; i++) {
8821 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
8822 switch (s->state) {
8823 case SSTATE_NOT_INTERNED:
8824 /* XXX Shouldn't happen */
8825 break;
8826 case SSTATE_INTERNED_IMMORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00008827 Py_Refcnt(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00008828 immortal_size += s->length;
8829 break;
8830 case SSTATE_INTERNED_MORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00008831 Py_Refcnt(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00008832 mortal_size += s->length;
8833 break;
8834 default:
8835 Py_FatalError("Inconsistent interned string state.");
8836 }
8837 s->state = SSTATE_NOT_INTERNED;
8838 }
8839 fprintf(stderr, "total size of all interned strings: "
8840 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
8841 "mortal/immortal\n", mortal_size, immortal_size);
8842 Py_DECREF(keys);
8843 PyDict_Clear(interned);
8844 Py_DECREF(interned);
8845 interned = NULL;
8846}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008847
8848
8849/********************* Unicode Iterator **************************/
8850
8851typedef struct {
8852 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00008853 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008854 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
8855} unicodeiterobject;
8856
8857static void
8858unicodeiter_dealloc(unicodeiterobject *it)
8859{
8860 _PyObject_GC_UNTRACK(it);
8861 Py_XDECREF(it->it_seq);
8862 PyObject_GC_Del(it);
8863}
8864
8865static int
8866unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8867{
8868 Py_VISIT(it->it_seq);
8869 return 0;
8870}
8871
8872static PyObject *
8873unicodeiter_next(unicodeiterobject *it)
8874{
8875 PyUnicodeObject *seq;
8876 PyObject *item;
8877
8878 assert(it != NULL);
8879 seq = it->it_seq;
8880 if (seq == NULL)
8881 return NULL;
8882 assert(PyUnicode_Check(seq));
8883
8884 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008885 item = PyUnicode_FromUnicode(
8886 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008887 if (item != NULL)
8888 ++it->it_index;
8889 return item;
8890 }
8891
8892 Py_DECREF(seq);
8893 it->it_seq = NULL;
8894 return NULL;
8895}
8896
8897static PyObject *
8898unicodeiter_len(unicodeiterobject *it)
8899{
8900 Py_ssize_t len = 0;
8901 if (it->it_seq)
8902 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8903 return PyInt_FromSsize_t(len);
8904}
8905
8906PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8907
8908static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008909 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8910 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008911 {NULL, NULL} /* sentinel */
8912};
8913
8914PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008915 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008916 "unicodeiterator", /* tp_name */
8917 sizeof(unicodeiterobject), /* tp_basicsize */
8918 0, /* tp_itemsize */
8919 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00008920 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008921 0, /* tp_print */
8922 0, /* tp_getattr */
8923 0, /* tp_setattr */
8924 0, /* tp_compare */
8925 0, /* tp_repr */
8926 0, /* tp_as_number */
8927 0, /* tp_as_sequence */
8928 0, /* tp_as_mapping */
8929 0, /* tp_hash */
8930 0, /* tp_call */
8931 0, /* tp_str */
8932 PyObject_GenericGetAttr, /* tp_getattro */
8933 0, /* tp_setattro */
8934 0, /* tp_as_buffer */
8935 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8936 0, /* tp_doc */
8937 (traverseproc)unicodeiter_traverse, /* tp_traverse */
8938 0, /* tp_clear */
8939 0, /* tp_richcompare */
8940 0, /* tp_weaklistoffset */
8941 PyObject_SelfIter, /* tp_iter */
8942 (iternextfunc)unicodeiter_next, /* tp_iternext */
8943 unicodeiter_methods, /* tp_methods */
8944 0,
8945};
8946
8947static PyObject *
8948unicode_iter(PyObject *seq)
8949{
8950 unicodeiterobject *it;
8951
8952 if (!PyUnicode_Check(seq)) {
8953 PyErr_BadInternalCall();
8954 return NULL;
8955 }
8956 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8957 if (it == NULL)
8958 return NULL;
8959 it->it_index = 0;
8960 Py_INCREF(seq);
8961 it->it_seq = (PyUnicodeObject *)seq;
8962 _PyObject_GC_TRACK(it);
8963 return (PyObject *)it;
8964}
8965
Martin v. Löwis5b222132007-06-10 09:51:05 +00008966size_t
8967Py_UNICODE_strlen(const Py_UNICODE *u)
8968{
8969 int res = 0;
8970 while(*u++)
8971 res++;
8972 return res;
8973}
8974
8975Py_UNICODE*
8976Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
8977{
8978 Py_UNICODE *u = s1;
8979 while ((*u++ = *s2++));
8980 return s1;
8981}
8982
8983Py_UNICODE*
8984Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
8985{
8986 Py_UNICODE *u = s1;
8987 while ((*u++ = *s2++))
8988 if (n-- == 0)
8989 break;
8990 return s1;
8991}
8992
8993int
8994Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
8995{
8996 while (*s1 && *s2 && *s1 == *s2)
8997 s1++, s2++;
8998 if (*s1 && *s2)
8999 return (*s1 < *s2) ? -1 : +1;
9000 if (*s1)
9001 return 1;
9002 if (*s2)
9003 return -1;
9004 return 0;
9005}
9006
9007Py_UNICODE*
9008Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9009{
9010 const Py_UNICODE *p;
9011 for (p = s; *p; p++)
9012 if (*p == c)
9013 return (Py_UNICODE*)p;
9014 return NULL;
9015}
9016
9017
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009018#ifdef __cplusplus
9019}
9020#endif
9021
9022
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009023/*
9024Local variables:
9025c-basic-offset: 4
9026indent-tabs-mode: nil
9027End:
9028*/