blob: bf031bb4e94158276d6a3c0a80f046df424e7c16 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of released Unicode objects that are kept
   on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Keep-alive threshold for objects parked on the free list.

   A released object whose length is below this limit keeps its
   character buffer while it sits on the free list.  This reduces
   malloc() overhead for small Unicode objects.

   At worst this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused memory allocated.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection: little endian unless WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Walter Dörwald16807132007-05-25 13:52:07 +000095/* This dictionary holds all interned unicode strings. Note that references
96 to strings in this dictionary are *not* counted in the string's ob_refcnt.
97 When the interned string reaches a refcnt of 0 the string deallocation
98 function will delete the reference from this dictionary.
99
100 Another way to look at this is that to say that the actual reference
101 count of a string is: s->ob_refcnt + (s->ob_sstate?2:0)
102*/
103static PyObject *interned;
104
Guido van Rossumd57fd912000-03-10 22:53:23 +0000105/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000106static PyUnicodeObject *unicode_freelist;
107static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000109/* The empty Unicode object is shared to improve performance. */
110static PyUnicodeObject *unicode_empty;
111
112/* Single character Unicode strings in the Latin-1 range are being
113 shared as well. */
114static PyUnicodeObject *unicode_latin1[256];
115
Fred Drakee4315f52000-05-09 19:53:39 +0000116/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000117 parameter; it is fixed to "utf-8". Always use the
118 PyUnicode_GetDefaultEncoding() API to access this global. */
119static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000120
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000121Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000122PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000123{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000124#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000125 return 0x10FFFF;
126#else
127 /* This is actually an illegal character, so it should
128 not be passed to unichr. */
129 return 0xFFFF;
130#endif
131}
132
Thomas Wouters477c8d52006-05-27 19:21:47 +0000133/* --- Bloom Filters ----------------------------------------------------- */
134
/* Simple "bloom filters" for Unicode characters.  To keep things simple,
   a filter is a single bitmask; the least significant 5 bits of each
   unicode character select the bit that stands for it. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by ch is set in mask.

   The shifted constant is unsigned long: (ch & 0x1F) can be 31, and
   shifting a signed int 1 into the sign bit is undefined behavior.
   mask is parenthesized so that expression arguments bind correctly. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line break test: the bloom mask is a cheap pre-filter, the exact
   Py_UNICODE_ISLINEBREAK() check is only run on a mask hit. */
#define BLOOM_LINEBREAK(ch) \
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
149
150Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
151{
152 /* calculate simple bloom-style bitmask for a given unicode string */
153
154 long mask;
155 Py_ssize_t i;
156
157 mask = 0;
158 for (i = 0; i < len; i++)
159 mask |= (1 << (ptr[i] & 0x1F));
160
161 return mask;
162}
163
164Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
165{
166 Py_ssize_t i;
167
168 for (i = 0; i < setlen; i++)
169 if (set[i] == chr)
170 return 1;
171
172 return 0;
173}
174
/* Non-zero if chr is one of the setlen characters in set.  The bloom mask
   is checked first as a cheap pre-filter; the exact scan only runs on a
   mask hit.  The expansion is fully parenthesized so the macro can be
   combined with !, || or ?: without changing its meaning. */
#define BLOOM_MEMBER(mask, chr, set, setlen) \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
177
Guido van Rossumd57fd912000-03-10 22:53:23 +0000178/* --- Unicode Object ----------------------------------------------------- */
179
180static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000182 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183{
184 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000185
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000186 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000187 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000188 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190 /* Resizing shared object (unicode_empty or single character
191 objects) in-place is not allowed. Use PyUnicode_Resize()
192 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000194 if (unicode == unicode_empty ||
195 (unicode->length == 1 &&
196 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 return -1;
201 }
202
Thomas Wouters477c8d52006-05-27 19:21:47 +0000203 /* We allocate one more byte to make sure the string is Ux0000 terminated.
204 The overallocation is also used by fastsearch, which assumes that it's
205 safe to look at str[length] (without making any assumptions about what
206 it contains). */
207
Guido van Rossumd57fd912000-03-10 22:53:23 +0000208 oldstr = unicode->str;
209 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
210 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000211 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212 PyErr_NoMemory();
213 return -1;
214 }
215 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000216 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000218 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000220 if (unicode->defenc) {
221 Py_DECREF(unicode->defenc);
222 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000223 }
224 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000225
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 return 0;
227}
228
229/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000230 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231
232 XXX This allocator could further be enhanced by assuring that the
233 free list never reduces its size below 1.
234
235*/
236
237static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000238PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000239{
240 register PyUnicodeObject *unicode;
241
Thomas Wouters477c8d52006-05-27 19:21:47 +0000242 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (length == 0 && unicode_empty != NULL) {
244 Py_INCREF(unicode_empty);
245 return unicode_empty;
246 }
247
248 /* Unicode freelist & memory allocation */
249 if (unicode_freelist) {
250 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000251 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000254 /* Keep-Alive optimization: we only upsize the buffer,
255 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000256 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000257 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000258 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000259 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000262 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000264 }
265 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 }
267 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000268 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 if (unicode == NULL)
270 return NULL;
271 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
272 }
273
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000274 if (!unicode->str) {
275 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000276 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000277 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000278 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000279 * the caller fails before initializing str -- unicode_resize()
280 * reads str[0], and the Keep-Alive optimization can keep memory
281 * allocated for str alive across a call to unicode_dealloc(unicode).
282 * We don't want unicode_resize to read uninitialized memory in
283 * that case.
284 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000285 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000287 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000289 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000290 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000292
293 onError:
294 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000295 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000296 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297}
298
299static
Guido van Rossum9475a232001-10-05 20:51:39 +0000300void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301{
Walter Dörwald16807132007-05-25 13:52:07 +0000302 switch (PyUnicode_CHECK_INTERNED(unicode)) {
303 case SSTATE_NOT_INTERNED:
304 break;
305
306 case SSTATE_INTERNED_MORTAL:
307 /* revive dead object temporarily for DelItem */
308 unicode->ob_refcnt = 3;
309 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
310 Py_FatalError(
311 "deletion of interned unicode string failed");
312 break;
313
314 case SSTATE_INTERNED_IMMORTAL:
315 Py_FatalError("Immortal interned unicode string died.");
316
317 default:
318 Py_FatalError("Inconsistent interned unicode string state.");
319 }
320
Guido van Rossum604ddf82001-12-06 20:03:56 +0000321 if (PyUnicode_CheckExact(unicode) &&
322 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000323 /* Keep-Alive optimization */
324 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000325 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 unicode->str = NULL;
327 unicode->length = 0;
328 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000329 if (unicode->defenc) {
330 Py_DECREF(unicode->defenc);
331 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000332 }
333 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 *(PyUnicodeObject **)unicode = unicode_freelist;
335 unicode_freelist = unicode;
336 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 }
338 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000339 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000340 Py_XDECREF(unicode->defenc);
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000341 Py_Type(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 }
343}
344
Martin v. Löwis18e16552006-02-15 17:27:45 +0000345int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000346{
347 register PyUnicodeObject *v;
348
349 /* Argument checks */
350 if (unicode == NULL) {
351 PyErr_BadInternalCall();
352 return -1;
353 }
354 v = (PyUnicodeObject *)*unicode;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000355 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000356 PyErr_BadInternalCall();
357 return -1;
358 }
359
360 /* Resizing unicode_empty and single character objects is not
361 possible since these are being shared. We simply return a fresh
362 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000363 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000364 (v == unicode_empty || v->length == 1)) {
365 PyUnicodeObject *w = _PyUnicode_New(length);
366 if (w == NULL)
367 return -1;
368 Py_UNICODE_COPY(w->str, v->str,
369 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000370 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000371 *unicode = (PyObject *)w;
372 return 0;
373 }
374
375 /* Note that we don't have to modify *unicode for unshared Unicode
376 objects, since we can modify them in-place. */
377 return unicode_resize(v, length);
378}
379
380/* Internal API for use in unicodeobject.c only ! */
381#define _PyUnicode_Resize(unicodevar, length) \
382 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
383
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000385 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386{
387 PyUnicodeObject *unicode;
388
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000389 /* If the Unicode data is known at construction time, we can apply
390 some optimizations which share commonly used objects. */
391 if (u != NULL) {
392
393 /* Optimization for empty strings */
394 if (size == 0 && unicode_empty != NULL) {
395 Py_INCREF(unicode_empty);
396 return (PyObject *)unicode_empty;
397 }
398
399 /* Single character Unicode objects in the Latin-1 range are
400 shared when using this constructor */
401 if (size == 1 && *u < 256) {
402 unicode = unicode_latin1[*u];
403 if (!unicode) {
404 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000405 if (!unicode)
406 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000407 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000408 unicode_latin1[*u] = unicode;
409 }
410 Py_INCREF(unicode);
411 return (PyObject *)unicode;
412 }
413 }
Tim Petersced69f82003-09-16 20:30:58 +0000414
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 unicode = _PyUnicode_New(size);
416 if (!unicode)
417 return NULL;
418
419 /* Copy the Unicode data into the new object */
420 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000421 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000422
423 return (PyObject *)unicode;
424}
425
Walter Dörwaldd2034312007-05-18 16:29:38 +0000426PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000427{
428 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000429 /* If the Unicode data is known at construction time, we can apply
430 some optimizations which share commonly used objects. */
431 if (u != NULL) {
432
433 /* Optimization for empty strings */
434 if (size == 0 && unicode_empty != NULL) {
435 Py_INCREF(unicode_empty);
436 return (PyObject *)unicode_empty;
437 }
438
Walter Dörwald071b9da2007-05-05 14:21:20 +0000439 /* Single characters are shared when using this constructor */
440 if (size == 1) {
Guido van Rossum00058aa2007-07-19 18:21:28 +0000441 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000442 if (!unicode) {
443 unicode = _PyUnicode_New(1);
444 if (!unicode)
445 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000446 unicode->str[0] = Py_CHARMASK(*u);
447 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000448 }
449 Py_INCREF(unicode);
450 return (PyObject *)unicode;
451 }
452 }
453
Walter Dörwald55507312007-05-18 13:12:10 +0000454 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000455 if (!unicode)
456 return NULL;
457
458 /* Copy the Unicode data into the new object */
459 if (u != NULL) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000460 Py_UNICODE *p = unicode->str;
Martin v. Löwis5b222132007-06-10 09:51:05 +0000461 while (size--)
Guido van Rossum00058aa2007-07-19 18:21:28 +0000462 *p++ = Py_CHARMASK(*u++);
Martin v. Löwis5b222132007-06-10 09:51:05 +0000463 /* Don't need to write trailing 0 because
464 that's already done by _PyUnicode_New */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000465 }
466
467 return (PyObject *)unicode;
468}
469
Walter Dörwaldd2034312007-05-18 16:29:38 +0000470PyObject *PyUnicode_FromString(const char *u)
471{
472 size_t size = strlen(u);
473 if (size > PY_SSIZE_T_MAX) {
474 PyErr_SetString(PyExc_OverflowError, "input too long");
475 return NULL;
476 }
477
478 return PyUnicode_FromStringAndSize(u, size);
479}
480
Guido van Rossumd57fd912000-03-10 22:53:23 +0000481#ifdef HAVE_WCHAR_H
482
483PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000484 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000485{
486 PyUnicodeObject *unicode;
487
488 if (w == NULL) {
489 PyErr_BadInternalCall();
490 return NULL;
491 }
492
493 unicode = _PyUnicode_New(size);
494 if (!unicode)
495 return NULL;
496
497 /* Copy the wchar_t data into the new object */
498#ifdef HAVE_USABLE_WCHAR_T
499 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000500#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000501 {
502 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000503 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000504 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000505 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 *u++ = *w++;
507 }
508#endif
509
510 return (PyObject *)unicode;
511}
512
Walter Dörwald346737f2007-05-31 10:44:43 +0000513static void
514makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
515{
516 *fmt++ = '%';
517 if (width) {
518 if (zeropad)
519 *fmt++ = '0';
520 fmt += sprintf(fmt, "%d", width);
521 }
522 if (precision)
523 fmt += sprintf(fmt, ".%d", precision);
524 if (longflag)
525 *fmt++ = 'l';
526 else if (size_tflag) {
527 char *f = PY_FORMAT_SIZE_T;
528 while (*f)
529 *fmt++ = *f++;
530 }
531 *fmt++ = c;
532 *fmt = '\0';
533}
534
/* Append the NUL-terminated C string `string` to the output, one
   character per byte.  Relies on two variables of the enclosing function:
   `copy` (const char *, scratch cursor) and `s` (output cursor).

   - do { } while (0) makes the macro a single statement, so
     "if (x) appendstring(y); else ..." parses as intended.
   - Each byte is converted through unsigned char so that bytes >= 0x80
     are not sign-extended on platforms where plain char is signed. */
#define appendstring(string) \
    do { \
        for (copy = (string); *copy; ) \
            *s++ = (unsigned char)*copy++; \
    } while (0)
536
537PyObject *
538PyUnicode_FromFormatV(const char *format, va_list vargs)
539{
540 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000541 Py_ssize_t callcount = 0;
542 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000543 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000544 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000545 int width = 0;
546 int precision = 0;
547 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000548 const char* f;
549 Py_UNICODE *s;
550 PyObject *string;
551 /* used by sprintf */
552 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000553 /* use abuffer instead of buffer, if we need more space
554 * (which can happen if there's a format specifier with width). */
555 char *abuffer = NULL;
556 char *realbuffer;
557 Py_ssize_t abuffersize = 0;
558 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000559 const char *copy;
560
561#ifdef VA_LIST_IS_ARRAY
562 Py_MEMCPY(count, vargs, sizeof(va_list));
563#else
564#ifdef __va_copy
565 __va_copy(count, vargs);
566#else
567 count = vargs;
568#endif
569#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000570 /* step 1: count the number of %S/%R format specifications
571 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
572 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000573 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000574 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000575 ++callcount;
576 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000577 /* step 2: allocate memory for the results of
578 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000579 if (callcount) {
580 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
581 if (!callresults) {
582 PyErr_NoMemory();
583 return NULL;
584 }
585 callresult = callresults;
586 }
587 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000588 for (f = format; *f; f++) {
589 if (*f == '%') {
590 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000591 width = 0;
592 while (isdigit(Py_CHARMASK(*f)))
593 width = (width*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000594 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
595 ;
596
597 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
598 * they don't affect the amount of space we reserve.
599 */
600 if ((*f == 'l' || *f == 'z') &&
601 (f[1] == 'd' || f[1] == 'u'))
602 ++f;
603
604 switch (*f) {
605 case 'c':
606 (void)va_arg(count, int);
607 /* fall through... */
608 case '%':
609 n++;
610 break;
611 case 'd': case 'u': case 'i': case 'x':
612 (void) va_arg(count, int);
613 /* 20 bytes is enough to hold a 64-bit
614 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000615 This isn't enough for octal.
616 If a width is specified we need more
617 (which we allocate later). */
618 if (width < 20)
619 width = 20;
620 n += width;
621 if (abuffersize < width)
622 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000623 break;
624 case 's':
625 n += strlen(va_arg(count, char*));
626 break;
627 case 'U':
628 {
629 PyObject *obj = va_arg(count, PyObject *);
630 assert(obj && PyUnicode_Check(obj));
631 n += PyUnicode_GET_SIZE(obj);
632 break;
633 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000634 case 'V':
635 {
636 PyObject *obj = va_arg(count, PyObject *);
637 const char *str = va_arg(count, const char *);
638 assert(obj || str);
639 assert(!obj || PyUnicode_Check(obj));
640 if (obj)
641 n += PyUnicode_GET_SIZE(obj);
642 else
643 n += strlen(str);
644 break;
645 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000646 case 'S':
647 {
648 PyObject *obj = va_arg(count, PyObject *);
649 PyObject *str;
650 assert(obj);
651 str = PyObject_Unicode(obj);
652 if (!str)
653 goto fail;
654 n += PyUnicode_GET_SIZE(str);
655 /* Remember the str and switch to the next slot */
656 *callresult++ = str;
657 break;
658 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000659 case 'R':
660 {
661 PyObject *obj = va_arg(count, PyObject *);
662 PyObject *repr;
663 assert(obj);
664 repr = PyObject_Repr(obj);
665 if (!repr)
666 goto fail;
667 n += PyUnicode_GET_SIZE(repr);
668 /* Remember the repr and switch to the next slot */
669 *callresult++ = repr;
670 break;
671 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000672 case 'p':
673 (void) va_arg(count, int);
674 /* maximum 64-bit pointer representation:
675 * 0xffffffffffffffff
676 * so 19 characters is enough.
677 * XXX I count 18 -- what's the extra for?
678 */
679 n += 19;
680 break;
681 default:
682 /* if we stumble upon an unknown
683 formatting code, copy the rest of
684 the format string to the output
685 string. (we cannot just skip the
686 code, since there's no way to know
687 what's in the argument list) */
688 n += strlen(p);
689 goto expand;
690 }
691 } else
692 n++;
693 }
694 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000695 if (abuffersize > 20) {
696 abuffer = PyMem_Malloc(abuffersize);
697 if (!abuffer) {
698 PyErr_NoMemory();
699 goto fail;
700 }
701 realbuffer = abuffer;
702 }
703 else
704 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000705 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000706 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000707 we don't have to resize the string.
708 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000709 string = PyUnicode_FromUnicode(NULL, n);
710 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000711 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000712
713 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000714 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000715
716 for (f = format; *f; f++) {
717 if (*f == '%') {
718 const char* p = f++;
719 int longflag = 0;
720 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000721 zeropad = (*f == '0');
722 /* parse the width.precision part */
723 width = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000724 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000725 width = (width*10) + *f++ - '0';
726 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000727 if (*f == '.') {
728 f++;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000729 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000730 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000731 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000732 /* handle the long flag, but only for %ld and %lu.
733 others can be added when necessary. */
734 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
735 longflag = 1;
736 ++f;
737 }
738 /* handle the size_t flag. */
739 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
740 size_tflag = 1;
741 ++f;
742 }
743
744 switch (*f) {
745 case 'c':
746 *s++ = va_arg(vargs, int);
747 break;
748 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000749 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000750 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000751 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000752 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000753 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000754 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000755 sprintf(realbuffer, fmt, va_arg(vargs, int));
756 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000757 break;
758 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000759 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000760 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000761 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000762 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000763 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000764 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000765 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
766 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000767 break;
768 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000769 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
770 sprintf(realbuffer, fmt, va_arg(vargs, int));
771 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000772 break;
773 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000774 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
775 sprintf(realbuffer, fmt, va_arg(vargs, int));
776 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000777 break;
778 case 's':
779 p = va_arg(vargs, char*);
780 appendstring(p);
781 break;
782 case 'U':
783 {
784 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000785 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
786 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
787 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000788 break;
789 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000790 case 'V':
791 {
792 PyObject *obj = va_arg(vargs, PyObject *);
793 const char *str = va_arg(vargs, const char *);
794 if (obj) {
795 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
796 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
797 s += size;
798 } else {
799 appendstring(str);
800 }
801 break;
802 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000803 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000804 case 'R':
805 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000806 Py_UNICODE *ucopy;
807 Py_ssize_t usize;
808 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000809 /* unused, since we already have the result */
810 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000811 ucopy = PyUnicode_AS_UNICODE(*callresult);
812 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000813 for (upos = 0; upos<usize;)
814 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000815 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000816 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000817 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000818 ++callresult;
819 break;
820 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000821 case 'p':
822 sprintf(buffer, "%p", va_arg(vargs, void*));
823 /* %p is ill-defined: ensure leading 0x. */
824 if (buffer[1] == 'X')
825 buffer[1] = 'x';
826 else if (buffer[1] != 'x') {
827 memmove(buffer+2, buffer, strlen(buffer)+1);
828 buffer[0] = '0';
829 buffer[1] = 'x';
830 }
831 appendstring(buffer);
832 break;
833 case '%':
834 *s++ = '%';
835 break;
836 default:
837 appendstring(p);
838 goto end;
839 }
840 } else
841 *s++ = *f;
842 }
843
844 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000845 if (callresults)
846 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000847 if (abuffer)
848 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000849 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
850 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000851 fail:
852 if (callresults) {
853 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000854 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000855 Py_DECREF(*callresult2);
856 ++callresult2;
857 }
858 PyMem_Free(callresults);
859 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000860 if (abuffer)
861 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000862 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000863}
864
865#undef appendstring
866
867PyObject *
868PyUnicode_FromFormat(const char *format, ...)
869{
870 PyObject* ret;
871 va_list vargs;
872
873#ifdef HAVE_STDARG_PROTOTYPES
874 va_start(vargs, format);
875#else
876 va_start(vargs);
877#endif
878 ret = PyUnicode_FromFormatV(format, vargs);
879 va_end(vargs);
880 return ret;
881}
882
Martin v. Löwis18e16552006-02-15 17:27:45 +0000883Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
884 wchar_t *w,
885 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000886{
887 if (unicode == NULL) {
888 PyErr_BadInternalCall();
889 return -1;
890 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000891
892 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000893 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000894 size = PyUnicode_GET_SIZE(unicode) + 1;
895
Guido van Rossumd57fd912000-03-10 22:53:23 +0000896#ifdef HAVE_USABLE_WCHAR_T
897 memcpy(w, unicode->str, size * sizeof(wchar_t));
898#else
899 {
900 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000901 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000902 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000903 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000904 *w++ = *u++;
905 }
906#endif
907
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000908 if (size > PyUnicode_GET_SIZE(unicode))
909 return PyUnicode_GET_SIZE(unicode);
910 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000911 return size;
912}
913
914#endif
915
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000916PyObject *PyUnicode_FromOrdinal(int ordinal)
917{
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000918 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000919
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000920 if (ordinal < 0 || ordinal > 0x10ffff) {
921 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000922 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000923 return NULL;
924 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000925
926#ifndef Py_UNICODE_WIDE
927 if (ordinal > 0xffff) {
928 ordinal -= 0x10000;
929 s[0] = 0xD800 | (ordinal >> 10);
930 s[1] = 0xDC00 | (ordinal & 0x3FF);
931 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000932 }
933#endif
934
Hye-Shik Chang40574832004-04-06 07:24:51 +0000935 s[0] = (Py_UNICODE)ordinal;
936 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000937}
938
Guido van Rossumd57fd912000-03-10 22:53:23 +0000939PyObject *PyUnicode_FromObject(register PyObject *obj)
940{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000941 /* XXX Perhaps we should make this API an alias of
942 PyObject_Unicode() instead ?! */
943 if (PyUnicode_CheckExact(obj)) {
944 Py_INCREF(obj);
945 return obj;
946 }
947 if (PyUnicode_Check(obj)) {
948 /* For a Unicode subtype that's not a Unicode object,
949 return a true Unicode object with the same data. */
950 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
951 PyUnicode_GET_SIZE(obj));
952 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000953 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
954}
955
956PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
957 const char *encoding,
958 const char *errors)
959{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000960 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000961 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000962 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000963
Guido van Rossumd57fd912000-03-10 22:53:23 +0000964 if (obj == NULL) {
965 PyErr_BadInternalCall();
966 return NULL;
967 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000968
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000969#if 0
970 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000971 that no encodings is given and then redirect to
972 PyObject_Unicode() which then applies the additional logic for
973 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000974
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000975 NOTE: This API should really only be used for object which
976 represent *encoded* Unicode !
977
978 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000979 if (PyUnicode_Check(obj)) {
980 if (encoding) {
981 PyErr_SetString(PyExc_TypeError,
982 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000983 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000984 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000985 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000986 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000987#else
988 if (PyUnicode_Check(obj)) {
989 PyErr_SetString(PyExc_TypeError,
990 "decoding Unicode is not supported");
991 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000992 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000993#endif
994
995 /* Coerce object */
996 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000997 s = PyString_AS_STRING(obj);
998 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000999 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001000 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1001 /* Overwrite the error message with something more useful in
1002 case of a TypeError. */
1003 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001004 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001005 "coercing to Unicode: need string or buffer, "
1006 "%.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001007 Py_Type(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001008 goto onError;
1009 }
Tim Petersced69f82003-09-16 20:30:58 +00001010
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001011 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001012 if (len == 0) {
1013 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001014 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001015 }
Tim Petersced69f82003-09-16 20:30:58 +00001016 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001017 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001018
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001019 return v;
1020
1021 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001022 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001023}
1024
1025PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001026 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001027 const char *encoding,
1028 const char *errors)
1029{
1030 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001031
1032 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001033 encoding = PyUnicode_GetDefaultEncoding();
1034
1035 /* Shortcuts for common default encodings */
1036 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001037 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001038 else if (strcmp(encoding, "latin-1") == 0)
1039 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001040#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1041 else if (strcmp(encoding, "mbcs") == 0)
1042 return PyUnicode_DecodeMBCS(s, size, errors);
1043#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001044 else if (strcmp(encoding, "ascii") == 0)
1045 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001046
1047 /* Decode via the codec registry */
1048 buffer = PyBuffer_FromMemory((void *)s, size);
1049 if (buffer == NULL)
1050 goto onError;
1051 unicode = PyCodec_Decode(buffer, encoding, errors);
1052 if (unicode == NULL)
1053 goto onError;
1054 if (!PyUnicode_Check(unicode)) {
1055 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001056 "decoder did not return an unicode object (type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001057 Py_Type(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001058 Py_DECREF(unicode);
1059 goto onError;
1060 }
1061 Py_DECREF(buffer);
1062 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001063
Guido van Rossumd57fd912000-03-10 22:53:23 +00001064 onError:
1065 Py_XDECREF(buffer);
1066 return NULL;
1067}
1068
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001069PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1070 const char *encoding,
1071 const char *errors)
1072{
1073 PyObject *v;
1074
1075 if (!PyUnicode_Check(unicode)) {
1076 PyErr_BadArgument();
1077 goto onError;
1078 }
1079
1080 if (encoding == NULL)
1081 encoding = PyUnicode_GetDefaultEncoding();
1082
1083 /* Decode via the codec registry */
1084 v = PyCodec_Decode(unicode, encoding, errors);
1085 if (v == NULL)
1086 goto onError;
1087 return v;
1088
1089 onError:
1090 return NULL;
1091}
1092
Guido van Rossumd57fd912000-03-10 22:53:23 +00001093PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001094 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001095 const char *encoding,
1096 const char *errors)
1097{
1098 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001099
Guido van Rossumd57fd912000-03-10 22:53:23 +00001100 unicode = PyUnicode_FromUnicode(s, size);
1101 if (unicode == NULL)
1102 return NULL;
1103 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1104 Py_DECREF(unicode);
1105 return v;
1106}
1107
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001108PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1109 const char *encoding,
1110 const char *errors)
1111{
1112 PyObject *v;
1113
1114 if (!PyUnicode_Check(unicode)) {
1115 PyErr_BadArgument();
1116 goto onError;
1117 }
1118
1119 if (encoding == NULL)
1120 encoding = PyUnicode_GetDefaultEncoding();
1121
1122 /* Encode via the codec registry */
1123 v = PyCodec_Encode(unicode, encoding, errors);
1124 if (v == NULL)
1125 goto onError;
1126 return v;
1127
1128 onError:
1129 return NULL;
1130}
1131
Guido van Rossumd57fd912000-03-10 22:53:23 +00001132PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1133 const char *encoding,
1134 const char *errors)
1135{
1136 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001137
Guido van Rossumd57fd912000-03-10 22:53:23 +00001138 if (!PyUnicode_Check(unicode)) {
1139 PyErr_BadArgument();
1140 goto onError;
1141 }
Fred Drakee4315f52000-05-09 19:53:39 +00001142
Tim Petersced69f82003-09-16 20:30:58 +00001143 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001144 encoding = PyUnicode_GetDefaultEncoding();
1145
1146 /* Shortcuts for common default encodings */
1147 if (errors == NULL) {
1148 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001149 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001150 else if (strcmp(encoding, "latin-1") == 0)
1151 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001152#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1153 else if (strcmp(encoding, "mbcs") == 0)
1154 return PyUnicode_AsMBCSString(unicode);
1155#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001156 else if (strcmp(encoding, "ascii") == 0)
1157 return PyUnicode_AsASCIIString(unicode);
1158 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001159
1160 /* Encode via the codec registry */
1161 v = PyCodec_Encode(unicode, encoding, errors);
1162 if (v == NULL)
1163 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001164 if (!PyBytes_Check(v)) {
1165 if (PyString_Check(v)) {
1166 /* Old codec, turn it into bytes */
1167 PyObject *b = PyBytes_FromObject(v);
1168 Py_DECREF(v);
1169 return b;
1170 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001171 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001172 "encoder did not return a bytes object "
1173 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1174 v->ob_type->tp_name,
1175 encoding ? encoding : "NULL",
1176 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 Py_DECREF(v);
1178 goto onError;
1179 }
1180 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001181
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182 onError:
1183 return NULL;
1184}
1185
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001186PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1187 const char *errors)
1188{
1189 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001190 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001191 if (v)
1192 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001193 if (errors != NULL)
1194 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1195 if (errors == NULL) {
1196 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1197 PyUnicode_GET_SIZE(unicode),
1198 NULL);
1199 }
1200 else {
1201 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
1202 }
1203 if (!b)
1204 return NULL;
1205 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1206 PyBytes_Size(b));
1207 Py_DECREF(b);
Guido van Rossume7a0d392007-07-12 07:53:00 +00001208 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001209 return v;
1210}
1211
Martin v. Löwis5b222132007-06-10 09:51:05 +00001212char*
1213PyUnicode_AsString(PyObject *unicode)
1214{
1215 assert(PyUnicode_Check(unicode));
1216 unicode = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1217 if (!unicode)
1218 return NULL;
1219 return PyString_AsString(unicode);
1220}
1221
Guido van Rossumd57fd912000-03-10 22:53:23 +00001222Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1223{
1224 if (!PyUnicode_Check(unicode)) {
1225 PyErr_BadArgument();
1226 goto onError;
1227 }
1228 return PyUnicode_AS_UNICODE(unicode);
1229
1230 onError:
1231 return NULL;
1232}
1233
Martin v. Löwis18e16552006-02-15 17:27:45 +00001234Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001235{
1236 if (!PyUnicode_Check(unicode)) {
1237 PyErr_BadArgument();
1238 goto onError;
1239 }
1240 return PyUnicode_GET_SIZE(unicode);
1241
1242 onError:
1243 return -1;
1244}
1245
Thomas Wouters78890102000-07-22 19:25:51 +00001246const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001247{
1248 return unicode_default_encoding;
1249}
1250
1251int PyUnicode_SetDefaultEncoding(const char *encoding)
1252{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001253 if (strcmp(encoding, unicode_default_encoding) != 0) {
1254 PyErr_Format(PyExc_ValueError,
1255 "Can only set default encoding to %s",
1256 unicode_default_encoding);
1257 return -1;
1258 }
Fred Drakee4315f52000-05-09 19:53:39 +00001259 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001260}
1261
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001262/* error handling callback helper:
1263 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001264 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001265 and adjust various state variables.
1266 return 0 on success, -1 on error
1267*/
1268
1269static
1270int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1271 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001272 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1273 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001274{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001275 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001276
1277 PyObject *restuple = NULL;
1278 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001279 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1280 Py_ssize_t requiredsize;
1281 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001282 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001283 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001284 int res = -1;
1285
1286 if (*errorHandler == NULL) {
1287 *errorHandler = PyCodec_LookupError(errors);
1288 if (*errorHandler == NULL)
1289 goto onError;
1290 }
1291
1292 if (*exceptionObject == NULL) {
1293 *exceptionObject = PyUnicodeDecodeError_Create(
1294 encoding, input, insize, *startinpos, *endinpos, reason);
1295 if (*exceptionObject == NULL)
1296 goto onError;
1297 }
1298 else {
1299 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1300 goto onError;
1301 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1302 goto onError;
1303 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1304 goto onError;
1305 }
1306
1307 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1308 if (restuple == NULL)
1309 goto onError;
1310 if (!PyTuple_Check(restuple)) {
1311 PyErr_Format(PyExc_TypeError, &argparse[4]);
1312 goto onError;
1313 }
1314 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1315 goto onError;
1316 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001317 newpos = insize+newpos;
1318 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001319 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001320 goto onError;
1321 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001322
1323 /* need more space? (at least enough for what we
1324 have+the replacement+the rest of the string (starting
1325 at the new input position), so we won't have to check space
1326 when there are no errors in the rest of the string) */
1327 repptr = PyUnicode_AS_UNICODE(repunicode);
1328 repsize = PyUnicode_GET_SIZE(repunicode);
1329 requiredsize = *outpos + repsize + insize-newpos;
1330 if (requiredsize > outsize) {
1331 if (requiredsize<2*outsize)
1332 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001333 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001334 goto onError;
1335 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1336 }
1337 *endinpos = newpos;
1338 *inptr = input + newpos;
1339 Py_UNICODE_COPY(*outptr, repptr, repsize);
1340 *outptr += repsize;
1341 *outpos += repsize;
1342 /* we made it! */
1343 res = 0;
1344
1345 onError:
1346 Py_XDECREF(restuple);
1347 return res;
1348}
1349
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001350/* --- UTF-7 Codec -------------------------------------------------------- */
1351
1352/* see RFC2152 for details */
1353
/* Classification of the 128 ASCII code points for the UTF-7 codec.
   Each entry says whether the character can be written directly:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)
   One row per 16 code points. */
static
char utf7_special[128] = {
    /* 0x00 - 0x0F */  1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */  2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */  3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */  3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1372
/* SPECIAL(c, encodeO, encodeWS) is true when `c` is outside 1..127, is
   marked 1 in utf7_special, or is marked 2 / 3 while encodeWS / encodeO
   is set.

   Note: writing the lower bound as (c) <= 0 is a trick to work around
   gcc warnings about a comparison that is always false; since
   utf7_special[0] is 1 anyway, making that comparison true for 0 does
   not change the result. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 alphabet character for the low six bits of n. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): is c a letter, a digit, '+' or '/'? */
#define B64CHAR(c) \
    (isalnum(c) || (c) == '+' || (c) == '/')

/* UB64(c): six-bit value of the base64 character c.  The result is only
   meaningful for characters of the base64 alphabet. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least six bits are pending in `ch`,
   write the topmost six as a base64 character through `out` and reduce
   `bits` accordingly.  Modifies `out` and `bits`. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are
   pending in `ch`, take the topmost sixteen as one code unit.  The unit
   is written through `out`, except that a unit in 0xDC00..0xDFFF sets
   `surrogate`, sets `errmsg` and jumps to the `utf7Error` label, and the
   unit following such an error is dropped.  The expanding function must
   therefore provide `errmsg` and `utf7Error`. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* An error was already generated for the previous unit, \
               so do not bother checking whether this one is valid */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't \
               represent it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001415
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001416PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001417 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001418 const char *errors)
1419{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001420 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001421 Py_ssize_t startinpos;
1422 Py_ssize_t endinpos;
1423 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001424 const char *e;
1425 PyUnicodeObject *unicode;
1426 Py_UNICODE *p;
1427 const char *errmsg = "";
1428 int inShift = 0;
1429 unsigned int bitsleft = 0;
1430 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001431 int surrogate = 0;
1432 PyObject *errorHandler = NULL;
1433 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001434
1435 unicode = _PyUnicode_New(size);
1436 if (!unicode)
1437 return NULL;
1438 if (size == 0)
1439 return (PyObject *)unicode;
1440
1441 p = unicode->str;
1442 e = s + size;
1443
1444 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001445 Py_UNICODE ch;
1446 restart:
1447 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001448
1449 if (inShift) {
1450 if ((ch == '-') || !B64CHAR(ch)) {
1451 inShift = 0;
1452 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001453
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001454 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1455 if (bitsleft >= 6) {
1456 /* The shift sequence has a partial character in it. If
1457 bitsleft < 6 then we could just classify it as padding
1458 but that is not the case here */
1459
1460 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001461 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001462 }
1463 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001464 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001465 here so indicate the potential of a misencoded character. */
1466
1467 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1468 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1469 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001470 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001471 }
1472
1473 if (ch == '-') {
1474 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001475 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001476 inShift = 1;
1477 }
1478 } else if (SPECIAL(ch,0,0)) {
1479 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001480 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001481 } else {
1482 *p++ = ch;
1483 }
1484 } else {
1485 charsleft = (charsleft << 6) | UB64(ch);
1486 bitsleft += 6;
1487 s++;
1488 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1489 }
1490 }
1491 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001492 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001493 s++;
1494 if (s < e && *s == '-') {
1495 s++;
1496 *p++ = '+';
1497 } else
1498 {
1499 inShift = 1;
1500 bitsleft = 0;
1501 }
1502 }
1503 else if (SPECIAL(ch,0,0)) {
1504 errmsg = "unexpected special character";
1505 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001506 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001507 }
1508 else {
1509 *p++ = ch;
1510 s++;
1511 }
1512 continue;
1513 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001514 outpos = p-PyUnicode_AS_UNICODE(unicode);
1515 endinpos = s-starts;
1516 if (unicode_decode_call_errorhandler(
1517 errors, &errorHandler,
1518 "utf7", errmsg,
1519 starts, size, &startinpos, &endinpos, &exc, &s,
1520 (PyObject **)&unicode, &outpos, &p))
1521 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001522 }
1523
1524 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001525 outpos = p-PyUnicode_AS_UNICODE(unicode);
1526 endinpos = size;
1527 if (unicode_decode_call_errorhandler(
1528 errors, &errorHandler,
1529 "utf7", "unterminated shift sequence",
1530 starts, size, &startinpos, &endinpos, &exc, &s,
1531 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001532 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001533 if (s < e)
1534 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001535 }
1536
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001537 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001538 goto onError;
1539
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001540 Py_XDECREF(errorHandler);
1541 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001542 return (PyObject *)unicode;
1543
1544onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001545 Py_XDECREF(errorHandler);
1546 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001547 Py_DECREF(unicode);
1548 return NULL;
1549}
1550
1551
1552PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001553 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001554 int encodeSetO,
1555 int encodeWhiteSpace,
1556 const char *errors)
1557{
1558 PyObject *v;
1559 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001560 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001561 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001562 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001563 unsigned int bitsleft = 0;
1564 unsigned long charsleft = 0;
1565 char * out;
1566 char * start;
1567
1568 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001569 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001570
Walter Dörwald51ab4142007-05-05 14:43:36 +00001571 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001572 if (v == NULL)
1573 return NULL;
1574
Walter Dörwald51ab4142007-05-05 14:43:36 +00001575 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001576 for (;i < size; ++i) {
1577 Py_UNICODE ch = s[i];
1578
1579 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001580 if (ch == '+') {
1581 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001582 *out++ = '-';
1583 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1584 charsleft = ch;
1585 bitsleft = 16;
1586 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001587 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001588 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001589 } else {
1590 *out++ = (char) ch;
1591 }
1592 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001593 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1594 *out++ = B64(charsleft << (6-bitsleft));
1595 charsleft = 0;
1596 bitsleft = 0;
1597 /* Characters not in the BASE64 set implicitly unshift the sequence
1598 so no '-' is required, except if the character is itself a '-' */
1599 if (B64CHAR(ch) || ch == '-') {
1600 *out++ = '-';
1601 }
1602 inShift = 0;
1603 *out++ = (char) ch;
1604 } else {
1605 bitsleft += 16;
1606 charsleft = (charsleft << 16) | ch;
1607 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1608
1609 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001610 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001611 or '-' then the shift sequence will be terminated implicitly and we
1612 don't have to insert a '-'. */
1613
1614 if (bitsleft == 0) {
1615 if (i + 1 < size) {
1616 Py_UNICODE ch2 = s[i+1];
1617
1618 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001619
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001620 } else if (B64CHAR(ch2) || ch2 == '-') {
1621 *out++ = '-';
1622 inShift = 0;
1623 } else {
1624 inShift = 0;
1625 }
1626
1627 }
1628 else {
1629 *out++ = '-';
1630 inShift = 0;
1631 }
1632 }
Tim Petersced69f82003-09-16 20:30:58 +00001633 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001634 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001635 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001636 if (bitsleft) {
1637 *out++= B64(charsleft << (6-bitsleft) );
1638 *out++ = '-';
1639 }
1640
Walter Dörwald51ab4142007-05-05 14:43:36 +00001641 if (PyBytes_Resize(v, out - start)) {
1642 Py_DECREF(v);
1643 return NULL;
1644 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001645 return v;
1646}
1647
1648#undef SPECIAL
1649#undef B64
1650#undef B64CHAR
1651#undef UB64
1652#undef ENCODE
1653#undef DECODE
1654
Guido van Rossumd57fd912000-03-10 22:53:23 +00001655/* --- UTF-8 Codec -------------------------------------------------------- */
1656
/* Map a UTF-8 lead byte to the total length in bytes of the sequence it
   introduces.  Zero means illegal prefix: the entries for 0x80-0xBF and
   for 0xFE/0xFF are zero.  The 5 and 6 entries (0xF8-0xFD) follow the
   layout described in RFC 2279.  The table is only read, hence const. */
static const
char utf8_code_length[256] = {
    /* 0x00-0x7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF7: four, 0xF8-0xFB: five, 0xFC-0xFD: six, 0xFE-0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1678
Guido van Rossumd57fd912000-03-10 22:53:23 +00001679PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001680 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001681 const char *errors)
1682{
Walter Dörwald69652032004-09-07 20:24:22 +00001683 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1684}
1685
1686PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001687 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001688 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001689 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001690{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001691 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001692 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001693 Py_ssize_t startinpos;
1694 Py_ssize_t endinpos;
1695 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001696 const char *e;
1697 PyUnicodeObject *unicode;
1698 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001699 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001700 PyObject *errorHandler = NULL;
1701 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001702
1703 /* Note: size will always be longer than the resulting Unicode
1704 character count */
1705 unicode = _PyUnicode_New(size);
1706 if (!unicode)
1707 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001708 if (size == 0) {
1709 if (consumed)
1710 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001711 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001712 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001713
1714 /* Unpack UTF-8 encoded data */
1715 p = unicode->str;
1716 e = s + size;
1717
1718 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001719 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001720
1721 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001722 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001723 s++;
1724 continue;
1725 }
1726
1727 n = utf8_code_length[ch];
1728
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001729 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001730 if (consumed)
1731 break;
1732 else {
1733 errmsg = "unexpected end of data";
1734 startinpos = s-starts;
1735 endinpos = size;
1736 goto utf8Error;
1737 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001738 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739
1740 switch (n) {
1741
1742 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001743 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001744 startinpos = s-starts;
1745 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001746 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747
1748 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001749 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001750 startinpos = s-starts;
1751 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001752 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753
1754 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001755 if ((s[1] & 0xc0) != 0x80) {
1756 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001757 startinpos = s-starts;
1758 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001759 goto utf8Error;
1760 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001761 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001762 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001763 startinpos = s-starts;
1764 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001765 errmsg = "illegal encoding";
1766 goto utf8Error;
1767 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001768 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001769 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001770 break;
1771
1772 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001773 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001774 (s[2] & 0xc0) != 0x80) {
1775 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001776 startinpos = s-starts;
1777 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001778 goto utf8Error;
1779 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001781 if (ch < 0x0800) {
1782 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001783 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001784
1785 XXX For wide builds (UCS-4) we should probably try
1786 to recombine the surrogates into a single code
1787 unit.
1788 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001789 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001790 startinpos = s-starts;
1791 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001792 goto utf8Error;
1793 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001794 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001795 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001796 break;
1797
1798 case 4:
1799 if ((s[1] & 0xc0) != 0x80 ||
1800 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001801 (s[3] & 0xc0) != 0x80) {
1802 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001803 startinpos = s-starts;
1804 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001805 goto utf8Error;
1806 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001807 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1808 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1809 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001810 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001811 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001812 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001813 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001814 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001815 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001816 startinpos = s-starts;
1817 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001818 goto utf8Error;
1819 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001820#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001821 *p++ = (Py_UNICODE)ch;
1822#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001823 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001824
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001825 /* translate from 10000..10FFFF to 0..FFFF */
1826 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001827
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001828 /* high surrogate = top 10 bits added to D800 */
1829 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001830
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001831 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001832 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001833#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001834 break;
1835
1836 default:
1837 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001838 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001839 startinpos = s-starts;
1840 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001841 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001842 }
1843 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001844 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001845
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001846 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001847 outpos = p-PyUnicode_AS_UNICODE(unicode);
1848 if (unicode_decode_call_errorhandler(
1849 errors, &errorHandler,
1850 "utf8", errmsg,
1851 starts, size, &startinpos, &endinpos, &exc, &s,
1852 (PyObject **)&unicode, &outpos, &p))
1853 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854 }
Walter Dörwald69652032004-09-07 20:24:22 +00001855 if (consumed)
1856 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001857
1858 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001859 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001860 goto onError;
1861
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001862 Py_XDECREF(errorHandler);
1863 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001864 return (PyObject *)unicode;
1865
1866onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001867 Py_XDECREF(errorHandler);
1868 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001869 Py_DECREF(unicode);
1870 return NULL;
1871}
1872
Tim Peters602f7402002-04-27 18:03:26 +00001873/* Allocation strategy: if the string is short, convert into a stack buffer
1874 and allocate exactly as much space needed at the end. Else allocate the
1875 maximum possible needed (4 result bytes per Unicode character), and return
1876 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001877*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001878PyObject *
1879PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001880 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001881 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001882{
Tim Peters602f7402002-04-27 18:03:26 +00001883#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001884
Martin v. Löwis18e16552006-02-15 17:27:45 +00001885 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001886 PyObject *v; /* result string object */
1887 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001888 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001889 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001890 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001891
Tim Peters602f7402002-04-27 18:03:26 +00001892 assert(s != NULL);
1893 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001894
Tim Peters602f7402002-04-27 18:03:26 +00001895 if (size <= MAX_SHORT_UNICHARS) {
1896 /* Write into the stack buffer; nallocated can't overflow.
1897 * At the end, we'll allocate exactly as much heap space as it
1898 * turns out we need.
1899 */
1900 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1901 v = NULL; /* will allocate after we're done */
1902 p = stackbuf;
1903 }
1904 else {
1905 /* Overallocate on the heap, and give the excess back at the end. */
1906 nallocated = size * 4;
1907 if (nallocated / 4 != size) /* overflow! */
1908 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001909 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001910 if (v == NULL)
1911 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001912 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001913 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001914
Tim Peters602f7402002-04-27 18:03:26 +00001915 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001916 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001917
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001918 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001919 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001920 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001921
Guido van Rossumd57fd912000-03-10 22:53:23 +00001922 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001923 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001924 *p++ = (char)(0xc0 | (ch >> 6));
1925 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001926 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001927 else {
Tim Peters602f7402002-04-27 18:03:26 +00001928 /* Encode UCS2 Unicode ordinals */
1929 if (ch < 0x10000) {
1930 /* Special case: check for high surrogate */
1931 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1932 Py_UCS4 ch2 = s[i];
1933 /* Check for low surrogate and combine the two to
1934 form a UCS4 value */
1935 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001936 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001937 i++;
1938 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001939 }
Tim Peters602f7402002-04-27 18:03:26 +00001940 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001941 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001942 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001943 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1944 *p++ = (char)(0x80 | (ch & 0x3f));
1945 continue;
1946 }
1947encodeUCS4:
1948 /* Encode UCS4 Unicode ordinals */
1949 *p++ = (char)(0xf0 | (ch >> 18));
1950 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1951 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1952 *p++ = (char)(0x80 | (ch & 0x3f));
1953 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001954 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001955
Tim Peters602f7402002-04-27 18:03:26 +00001956 if (v == NULL) {
1957 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001958 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001959 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001960 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001961 }
1962 else {
1963 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001964 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001965 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001966 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001967 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001968 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001969
Tim Peters602f7402002-04-27 18:03:26 +00001970#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971}
1972
Guido van Rossumd57fd912000-03-10 22:53:23 +00001973PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1974{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975 if (!PyUnicode_Check(unicode)) {
1976 PyErr_BadArgument();
1977 return NULL;
1978 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001979 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1980 PyUnicode_GET_SIZE(unicode),
1981 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001982}
1983
1984/* --- UTF-16 Codec ------------------------------------------------------- */
1985
Tim Peters772747b2001-08-09 22:21:55 +00001986PyObject *
1987PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001988 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001989 const char *errors,
1990 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001991{
Walter Dörwald69652032004-09-07 20:24:22 +00001992 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1993}
1994
1995PyObject *
1996PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001997 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001998 const char *errors,
1999 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002000 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002001{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002002 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002003 Py_ssize_t startinpos;
2004 Py_ssize_t endinpos;
2005 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002006 PyUnicodeObject *unicode;
2007 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002008 const unsigned char *q, *e;
2009 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002010 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002011 /* Offsets from q for retrieving byte pairs in the right order. */
2012#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2013 int ihi = 1, ilo = 0;
2014#else
2015 int ihi = 0, ilo = 1;
2016#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002017 PyObject *errorHandler = NULL;
2018 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002019
2020 /* Note: size will always be longer than the resulting Unicode
2021 character count */
2022 unicode = _PyUnicode_New(size);
2023 if (!unicode)
2024 return NULL;
2025 if (size == 0)
2026 return (PyObject *)unicode;
2027
2028 /* Unpack UTF-16 encoded data */
2029 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002030 q = (unsigned char *)s;
2031 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002032
2033 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002034 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002035
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002036 /* Check for BOM marks (U+FEFF) in the input and adjust current
2037 byte order setting accordingly. In native mode, the leading BOM
2038 mark is skipped, in all other modes, it is copied to the output
2039 stream as-is (giving a ZWNBSP character). */
2040 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002041 if (size >= 2) {
2042 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002043#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002044 if (bom == 0xFEFF) {
2045 q += 2;
2046 bo = -1;
2047 }
2048 else if (bom == 0xFFFE) {
2049 q += 2;
2050 bo = 1;
2051 }
Tim Petersced69f82003-09-16 20:30:58 +00002052#else
Walter Dörwald69652032004-09-07 20:24:22 +00002053 if (bom == 0xFEFF) {
2054 q += 2;
2055 bo = 1;
2056 }
2057 else if (bom == 0xFFFE) {
2058 q += 2;
2059 bo = -1;
2060 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002061#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002062 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002063 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002064
Tim Peters772747b2001-08-09 22:21:55 +00002065 if (bo == -1) {
2066 /* force LE */
2067 ihi = 1;
2068 ilo = 0;
2069 }
2070 else if (bo == 1) {
2071 /* force BE */
2072 ihi = 0;
2073 ilo = 1;
2074 }
2075
2076 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002077 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002078 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002079 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002080 if (consumed)
2081 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002082 errmsg = "truncated data";
2083 startinpos = ((const char *)q)-starts;
2084 endinpos = ((const char *)e)-starts;
2085 goto utf16Error;
2086 /* The remaining input chars are ignored if the callback
2087 chooses to skip the input */
2088 }
2089 ch = (q[ihi] << 8) | q[ilo];
2090
Tim Peters772747b2001-08-09 22:21:55 +00002091 q += 2;
2092
Guido van Rossumd57fd912000-03-10 22:53:23 +00002093 if (ch < 0xD800 || ch > 0xDFFF) {
2094 *p++ = ch;
2095 continue;
2096 }
2097
2098 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002099 if (q >= e) {
2100 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002101 startinpos = (((const char *)q)-2)-starts;
2102 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002103 goto utf16Error;
2104 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002105 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002106 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2107 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002108 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002109#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002110 *p++ = ch;
2111 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002112#else
2113 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002114#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002115 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002116 }
2117 else {
2118 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002119 startinpos = (((const char *)q)-4)-starts;
2120 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002121 goto utf16Error;
2122 }
2123
Guido van Rossumd57fd912000-03-10 22:53:23 +00002124 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002125 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002126 startinpos = (((const char *)q)-2)-starts;
2127 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002128 /* Fall through to report the error */
2129
2130 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002131 outpos = p-PyUnicode_AS_UNICODE(unicode);
2132 if (unicode_decode_call_errorhandler(
2133 errors, &errorHandler,
2134 "utf16", errmsg,
2135 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2136 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002137 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002138 }
2139
2140 if (byteorder)
2141 *byteorder = bo;
2142
Walter Dörwald69652032004-09-07 20:24:22 +00002143 if (consumed)
2144 *consumed = (const char *)q-starts;
2145
Guido van Rossumd57fd912000-03-10 22:53:23 +00002146 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002147 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148 goto onError;
2149
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002150 Py_XDECREF(errorHandler);
2151 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002152 return (PyObject *)unicode;
2153
2154onError:
2155 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002156 Py_XDECREF(errorHandler);
2157 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002158 return NULL;
2159}
2160
Tim Peters772747b2001-08-09 22:21:55 +00002161PyObject *
2162PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002163 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002164 const char *errors,
2165 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002166{
2167 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002168 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002169#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002170 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002171#else
2172 const int pairs = 0;
2173#endif
Tim Peters772747b2001-08-09 22:21:55 +00002174 /* Offsets from p for storing byte pairs in the right order. */
2175#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2176 int ihi = 1, ilo = 0;
2177#else
2178 int ihi = 0, ilo = 1;
2179#endif
2180
2181#define STORECHAR(CH) \
2182 do { \
2183 p[ihi] = ((CH) >> 8) & 0xff; \
2184 p[ilo] = (CH) & 0xff; \
2185 p += 2; \
2186 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002187
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002188#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002189 for (i = pairs = 0; i < size; i++)
2190 if (s[i] >= 0x10000)
2191 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002192#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002193 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002194 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002195 if (v == NULL)
2196 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002197
Walter Dörwald3cc34522007-05-04 10:48:27 +00002198 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002199 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002200 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002201 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002202 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002203
2204 if (byteorder == -1) {
2205 /* force LE */
2206 ihi = 1;
2207 ilo = 0;
2208 }
2209 else if (byteorder == 1) {
2210 /* force BE */
2211 ihi = 0;
2212 ilo = 1;
2213 }
2214
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002215 while (size-- > 0) {
2216 Py_UNICODE ch = *s++;
2217 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002218#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002219 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002220 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2221 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002223#endif
Tim Peters772747b2001-08-09 22:21:55 +00002224 STORECHAR(ch);
2225 if (ch2)
2226 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002227 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002229#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002230}
2231
2232PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2233{
2234 if (!PyUnicode_Check(unicode)) {
2235 PyErr_BadArgument();
2236 return NULL;
2237 }
2238 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2239 PyUnicode_GET_SIZE(unicode),
2240 NULL,
2241 0);
2242}
2243
2244/* --- Unicode Escape Codec ----------------------------------------------- */
2245
Fredrik Lundh06d12682001-01-24 07:59:11 +00002246static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002247
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002249 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002250 const char *errors)
2251{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002252 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002253 Py_ssize_t startinpos;
2254 Py_ssize_t endinpos;
2255 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002256 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002257 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002258 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002259 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002260 char* message;
2261 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002262 PyObject *errorHandler = NULL;
2263 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002264
Guido van Rossumd57fd912000-03-10 22:53:23 +00002265 /* Escaped strings will always be longer than the resulting
2266 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002267 length after conversion to the true value.
2268 (but if the error callback returns a long replacement string
2269 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002270 v = _PyUnicode_New(size);
2271 if (v == NULL)
2272 goto onError;
2273 if (size == 0)
2274 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002275
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002276 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002277 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002278
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279 while (s < end) {
2280 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002281 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002282 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002283
2284 /* Non-escape characters are interpreted as Unicode ordinals */
2285 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002286 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002287 continue;
2288 }
2289
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002290 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291 /* \ - Escapes */
2292 s++;
2293 switch (*s++) {
2294
2295 /* \x escapes */
2296 case '\n': break;
2297 case '\\': *p++ = '\\'; break;
2298 case '\'': *p++ = '\''; break;
2299 case '\"': *p++ = '\"'; break;
2300 case 'b': *p++ = '\b'; break;
2301 case 'f': *p++ = '\014'; break; /* FF */
2302 case 't': *p++ = '\t'; break;
2303 case 'n': *p++ = '\n'; break;
2304 case 'r': *p++ = '\r'; break;
2305 case 'v': *p++ = '\013'; break; /* VT */
2306 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2307
2308 /* \OOO (octal) escapes */
2309 case '0': case '1': case '2': case '3':
2310 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002311 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002312 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002313 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002314 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002315 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002316 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002317 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002318 break;
2319
Fredrik Lundhccc74732001-02-18 22:13:49 +00002320 /* hex escapes */
2321 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002322 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002323 digits = 2;
2324 message = "truncated \\xXX escape";
2325 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002326
Fredrik Lundhccc74732001-02-18 22:13:49 +00002327 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002328 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002329 digits = 4;
2330 message = "truncated \\uXXXX escape";
2331 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002332
Fredrik Lundhccc74732001-02-18 22:13:49 +00002333 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002334 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002335 digits = 8;
2336 message = "truncated \\UXXXXXXXX escape";
2337 hexescape:
2338 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002339 outpos = p-PyUnicode_AS_UNICODE(v);
2340 if (s+digits>end) {
2341 endinpos = size;
2342 if (unicode_decode_call_errorhandler(
2343 errors, &errorHandler,
2344 "unicodeescape", "end of string in escape sequence",
2345 starts, size, &startinpos, &endinpos, &exc, &s,
2346 (PyObject **)&v, &outpos, &p))
2347 goto onError;
2348 goto nextByte;
2349 }
2350 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002351 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002352 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002353 endinpos = (s+i+1)-starts;
2354 if (unicode_decode_call_errorhandler(
2355 errors, &errorHandler,
2356 "unicodeescape", message,
2357 starts, size, &startinpos, &endinpos, &exc, &s,
2358 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002359 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002360 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002361 }
2362 chr = (chr<<4) & ~0xF;
2363 if (c >= '0' && c <= '9')
2364 chr += c - '0';
2365 else if (c >= 'a' && c <= 'f')
2366 chr += 10 + c - 'a';
2367 else
2368 chr += 10 + c - 'A';
2369 }
2370 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002371 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002372 /* _decoding_error will have already written into the
2373 target buffer. */
2374 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002375 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002376 /* when we get here, chr is a 32-bit unicode character */
2377 if (chr <= 0xffff)
2378 /* UCS-2 character */
2379 *p++ = (Py_UNICODE) chr;
2380 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002381 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002382 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002383#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002384 *p++ = chr;
2385#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002386 chr -= 0x10000L;
2387 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002388 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002389#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002390 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002391 endinpos = s-starts;
2392 outpos = p-PyUnicode_AS_UNICODE(v);
2393 if (unicode_decode_call_errorhandler(
2394 errors, &errorHandler,
2395 "unicodeescape", "illegal Unicode character",
2396 starts, size, &startinpos, &endinpos, &exc, &s,
2397 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002398 goto onError;
2399 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002400 break;
2401
2402 /* \N{name} */
2403 case 'N':
2404 message = "malformed \\N character escape";
2405 if (ucnhash_CAPI == NULL) {
2406 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002407 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002408 m = PyImport_ImportModule("unicodedata");
2409 if (m == NULL)
2410 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002411 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002412 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002413 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002414 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002415 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002416 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002417 if (ucnhash_CAPI == NULL)
2418 goto ucnhashError;
2419 }
2420 if (*s == '{') {
2421 const char *start = s+1;
2422 /* look for the closing brace */
2423 while (*s != '}' && s < end)
2424 s++;
2425 if (s > start && s < end && *s == '}') {
2426 /* found a name. look it up in the unicode database */
2427 message = "unknown Unicode character name";
2428 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002429 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002430 goto store;
2431 }
2432 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002433 endinpos = s-starts;
2434 outpos = p-PyUnicode_AS_UNICODE(v);
2435 if (unicode_decode_call_errorhandler(
2436 errors, &errorHandler,
2437 "unicodeescape", message,
2438 starts, size, &startinpos, &endinpos, &exc, &s,
2439 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002440 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002441 break;
2442
2443 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002444 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002445 message = "\\ at end of string";
2446 s--;
2447 endinpos = s-starts;
2448 outpos = p-PyUnicode_AS_UNICODE(v);
2449 if (unicode_decode_call_errorhandler(
2450 errors, &errorHandler,
2451 "unicodeescape", message,
2452 starts, size, &startinpos, &endinpos, &exc, &s,
2453 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002454 goto onError;
2455 }
2456 else {
2457 *p++ = '\\';
2458 *p++ = (unsigned char)s[-1];
2459 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002460 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002461 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002462 nextByte:
2463 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002464 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002465 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002466 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002467 Py_XDECREF(errorHandler);
2468 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002469 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002470
Fredrik Lundhccc74732001-02-18 22:13:49 +00002471ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002472 PyErr_SetString(
2473 PyExc_UnicodeError,
2474 "\\N escapes not supported (can't load unicodedata module)"
2475 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002476 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002477 Py_XDECREF(errorHandler);
2478 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002479 return NULL;
2480
Fredrik Lundhccc74732001-02-18 22:13:49 +00002481onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002482 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002483 Py_XDECREF(errorHandler);
2484 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002485 return NULL;
2486}
2487
2488/* Return a Unicode-Escape string version of the Unicode object.
2489
2490 If quotes is true, the string is enclosed in u"" or u'' quotes as
2491 appropriate.
2492
2493*/
2494
Thomas Wouters477c8d52006-05-27 19:21:47 +00002495Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2496 Py_ssize_t size,
2497 Py_UNICODE ch)
2498{
2499 /* like wcschr, but doesn't stop at NULL characters */
2500
2501 while (size-- > 0) {
2502 if (*s == ch)
2503 return s;
2504 s++;
2505 }
2506
2507 return NULL;
2508}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002509
Walter Dörwald79e913e2007-05-12 11:08:06 +00002510static const char *hexdigits = "0123456789abcdef";
2511
2512PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2513 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002514{
2515 PyObject *repr;
2516 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002517
Thomas Wouters89f507f2006-12-13 04:49:30 +00002518 /* XXX(nnorwitz): rather than over-allocating, it would be
2519 better to choose a different scheme. Perhaps scan the
2520 first N-chars of the string and allocate based on that size.
2521 */
2522 /* Initial allocation is based on the longest-possible unichr
2523 escape.
2524
2525 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2526 unichr, so in this case it's the longest unichr escape. In
2527 narrow (UTF-16) builds this is five chars per source unichr
2528 since there are two unichrs in the surrogate pair, so in narrow
2529 (UTF-16) builds it's not the longest unichr escape.
2530
2531 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2532 so in the narrow (UTF-16) build case it's the longest unichr
2533 escape.
2534 */
2535
Walter Dörwald79e913e2007-05-12 11:08:06 +00002536 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002537#ifdef Py_UNICODE_WIDE
2538 + 10*size
2539#else
2540 + 6*size
2541#endif
2542 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002543 if (repr == NULL)
2544 return NULL;
2545
Walter Dörwald79e913e2007-05-12 11:08:06 +00002546 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547
Guido van Rossumd57fd912000-03-10 22:53:23 +00002548 while (size-- > 0) {
2549 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002550
Walter Dörwald79e913e2007-05-12 11:08:06 +00002551 /* Escape backslashes */
2552 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002553 *p++ = '\\';
2554 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002555 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002556 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002557
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002558#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002559 /* Map 21-bit characters to '\U00xxxxxx' */
2560 else if (ch >= 0x10000) {
2561 *p++ = '\\';
2562 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002563 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2564 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2565 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2566 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2567 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2568 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2569 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2570 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002571 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002572 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002573#else
2574 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002575 else if (ch >= 0xD800 && ch < 0xDC00) {
2576 Py_UNICODE ch2;
2577 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002578
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002579 ch2 = *s++;
2580 size--;
2581 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2582 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2583 *p++ = '\\';
2584 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002585 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2586 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2587 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2588 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2589 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2590 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2591 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2592 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002593 continue;
2594 }
2595 /* Fall through: isolated surrogates are copied as-is */
2596 s--;
2597 size++;
2598 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002599#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002600
Guido van Rossumd57fd912000-03-10 22:53:23 +00002601 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002602 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002603 *p++ = '\\';
2604 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002605 *p++ = hexdigits[(ch >> 12) & 0x000F];
2606 *p++ = hexdigits[(ch >> 8) & 0x000F];
2607 *p++ = hexdigits[(ch >> 4) & 0x000F];
2608 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002609 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002610
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002611 /* Map special whitespace to '\t', \n', '\r' */
2612 else if (ch == '\t') {
2613 *p++ = '\\';
2614 *p++ = 't';
2615 }
2616 else if (ch == '\n') {
2617 *p++ = '\\';
2618 *p++ = 'n';
2619 }
2620 else if (ch == '\r') {
2621 *p++ = '\\';
2622 *p++ = 'r';
2623 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002624
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002625 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002626 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002627 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002628 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002629 *p++ = hexdigits[(ch >> 4) & 0x000F];
2630 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002631 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002632
Guido van Rossumd57fd912000-03-10 22:53:23 +00002633 /* Copy everything else as-is */
2634 else
2635 *p++ = (char) ch;
2636 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002637
2638 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002639 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2640 Py_DECREF(repr);
2641 return NULL;
2642 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002643 return repr;
2644}
2645
Guido van Rossumd57fd912000-03-10 22:53:23 +00002646PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2647{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002648 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002649 if (!PyUnicode_Check(unicode)) {
2650 PyErr_BadArgument();
2651 return NULL;
2652 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002653 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2654 PyUnicode_GET_SIZE(unicode));
2655
2656 if (!s)
2657 return NULL;
2658 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2659 PyBytes_GET_SIZE(s));
2660 Py_DECREF(s);
2661 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002662}
2663
2664/* --- Raw Unicode Escape Codec ------------------------------------------- */
2665
2666PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002667 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002668 const char *errors)
2669{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002670 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002671 Py_ssize_t startinpos;
2672 Py_ssize_t endinpos;
2673 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002674 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002675 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002676 const char *end;
2677 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002678 PyObject *errorHandler = NULL;
2679 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002680
Guido van Rossumd57fd912000-03-10 22:53:23 +00002681 /* Escaped strings will always be longer than the resulting
2682 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002683 length after conversion to the true value. (But decoding error
2684 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002685 v = _PyUnicode_New(size);
2686 if (v == NULL)
2687 goto onError;
2688 if (size == 0)
2689 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002690 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002691 end = s + size;
2692 while (s < end) {
2693 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002694 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002695 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002696 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002697
2698 /* Non-escape characters are interpreted as Unicode ordinals */
2699 if (*s != '\\') {
2700 *p++ = (unsigned char)*s++;
2701 continue;
2702 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002703 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002704
2705 /* \u-escapes are only interpreted iff the number of leading
2706 backslashes if odd */
2707 bs = s;
2708 for (;s < end;) {
2709 if (*s != '\\')
2710 break;
2711 *p++ = (unsigned char)*s++;
2712 }
2713 if (((s - bs) & 1) == 0 ||
2714 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002715 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002716 continue;
2717 }
2718 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002719 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002720 s++;
2721
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002722 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002723 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002724 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002725 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002726 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002727 endinpos = s-starts;
2728 if (unicode_decode_call_errorhandler(
2729 errors, &errorHandler,
2730 "rawunicodeescape", "truncated \\uXXXX",
2731 starts, size, &startinpos, &endinpos, &exc, &s,
2732 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002733 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002734 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735 }
2736 x = (x<<4) & ~0xF;
2737 if (c >= '0' && c <= '9')
2738 x += c - '0';
2739 else if (c >= 'a' && c <= 'f')
2740 x += 10 + c - 'a';
2741 else
2742 x += 10 + c - 'A';
2743 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002744#ifndef Py_UNICODE_WIDE
2745 if (x > 0x10000) {
2746 if (unicode_decode_call_errorhandler(
2747 errors, &errorHandler,
2748 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2749 starts, size, &startinpos, &endinpos, &exc, &s,
2750 (PyObject **)&v, &outpos, &p))
2751 goto onError;
2752 }
2753#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002754 *p++ = x;
2755 nextByte:
2756 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002758 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002759 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002760 Py_XDECREF(errorHandler);
2761 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002763
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764 onError:
2765 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002766 Py_XDECREF(errorHandler);
2767 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002768 return NULL;
2769}
2770
2771PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002772 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002773{
2774 PyObject *repr;
2775 char *p;
2776 char *q;
2777
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002778#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00002779 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002780#else
Walter Dörwald711005d2007-05-12 12:03:26 +00002781 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002782#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002783 if (repr == NULL)
2784 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002785 if (size == 0)
2786 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002787
Walter Dörwald711005d2007-05-12 12:03:26 +00002788 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002789 while (size-- > 0) {
2790 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002791#ifdef Py_UNICODE_WIDE
2792 /* Map 32-bit characters to '\Uxxxxxxxx' */
2793 if (ch >= 0x10000) {
2794 *p++ = '\\';
2795 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002796 *p++ = hexdigits[(ch >> 28) & 0xf];
2797 *p++ = hexdigits[(ch >> 24) & 0xf];
2798 *p++ = hexdigits[(ch >> 20) & 0xf];
2799 *p++ = hexdigits[(ch >> 16) & 0xf];
2800 *p++ = hexdigits[(ch >> 12) & 0xf];
2801 *p++ = hexdigits[(ch >> 8) & 0xf];
2802 *p++ = hexdigits[(ch >> 4) & 0xf];
2803 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002804 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002805 else
2806#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002807 /* Map 16-bit characters to '\uxxxx' */
2808 if (ch >= 256) {
2809 *p++ = '\\';
2810 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002811 *p++ = hexdigits[(ch >> 12) & 0xf];
2812 *p++ = hexdigits[(ch >> 8) & 0xf];
2813 *p++ = hexdigits[(ch >> 4) & 0xf];
2814 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002815 }
2816 /* Copy everything else as-is */
2817 else
2818 *p++ = (char) ch;
2819 }
2820 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00002821 if (PyBytes_Resize(repr, p - q)) {
2822 Py_DECREF(repr);
2823 return NULL;
2824 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002825 return repr;
2826}
2827
2828PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2829{
Walter Dörwald711005d2007-05-12 12:03:26 +00002830 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002831 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00002832 PyErr_BadArgument();
2833 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834 }
Walter Dörwald711005d2007-05-12 12:03:26 +00002835 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2836 PyUnicode_GET_SIZE(unicode));
2837
2838 if (!s)
2839 return NULL;
2840 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2841 PyBytes_GET_SIZE(s));
2842 Py_DECREF(s);
2843 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002844}
2845
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002846/* --- Unicode Internal Codec ------------------------------------------- */
2847
2848PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002849 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002850 const char *errors)
2851{
2852 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002853 Py_ssize_t startinpos;
2854 Py_ssize_t endinpos;
2855 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002856 PyUnicodeObject *v;
2857 Py_UNICODE *p;
2858 const char *end;
2859 const char *reason;
2860 PyObject *errorHandler = NULL;
2861 PyObject *exc = NULL;
2862
Neal Norwitzd43069c2006-01-08 01:12:10 +00002863#ifdef Py_UNICODE_WIDE
2864 Py_UNICODE unimax = PyUnicode_GetMax();
2865#endif
2866
Thomas Wouters89f507f2006-12-13 04:49:30 +00002867 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002868 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2869 if (v == NULL)
2870 goto onError;
2871 if (PyUnicode_GetSize((PyObject *)v) == 0)
2872 return (PyObject *)v;
2873 p = PyUnicode_AS_UNICODE(v);
2874 end = s + size;
2875
2876 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002877 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002878 /* We have to sanity check the raw data, otherwise doom looms for
2879 some malformed UCS-4 data. */
2880 if (
2881 #ifdef Py_UNICODE_WIDE
2882 *p > unimax || *p < 0 ||
2883 #endif
2884 end-s < Py_UNICODE_SIZE
2885 )
2886 {
2887 startinpos = s - starts;
2888 if (end-s < Py_UNICODE_SIZE) {
2889 endinpos = end-starts;
2890 reason = "truncated input";
2891 }
2892 else {
2893 endinpos = s - starts + Py_UNICODE_SIZE;
2894 reason = "illegal code point (> 0x10FFFF)";
2895 }
2896 outpos = p - PyUnicode_AS_UNICODE(v);
2897 if (unicode_decode_call_errorhandler(
2898 errors, &errorHandler,
2899 "unicode_internal", reason,
2900 starts, size, &startinpos, &endinpos, &exc, &s,
2901 (PyObject **)&v, &outpos, &p)) {
2902 goto onError;
2903 }
2904 }
2905 else {
2906 p++;
2907 s += Py_UNICODE_SIZE;
2908 }
2909 }
2910
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002911 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002912 goto onError;
2913 Py_XDECREF(errorHandler);
2914 Py_XDECREF(exc);
2915 return (PyObject *)v;
2916
2917 onError:
2918 Py_XDECREF(v);
2919 Py_XDECREF(errorHandler);
2920 Py_XDECREF(exc);
2921 return NULL;
2922}
2923
Guido van Rossumd57fd912000-03-10 22:53:23 +00002924/* --- Latin-1 Codec ------------------------------------------------------ */
2925
2926PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002927 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002928 const char *errors)
2929{
2930 PyUnicodeObject *v;
2931 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002932
Guido van Rossumd57fd912000-03-10 22:53:23 +00002933 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002934 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002935 Py_UNICODE r = *(unsigned char*)s;
2936 return PyUnicode_FromUnicode(&r, 1);
2937 }
2938
Guido van Rossumd57fd912000-03-10 22:53:23 +00002939 v = _PyUnicode_New(size);
2940 if (v == NULL)
2941 goto onError;
2942 if (size == 0)
2943 return (PyObject *)v;
2944 p = PyUnicode_AS_UNICODE(v);
2945 while (size-- > 0)
2946 *p++ = (unsigned char)*s++;
2947 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002948
Guido van Rossumd57fd912000-03-10 22:53:23 +00002949 onError:
2950 Py_XDECREF(v);
2951 return NULL;
2952}
2953
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002954/* create or adjust a UnicodeEncodeError */
2955static void make_encode_exception(PyObject **exceptionObject,
2956 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002957 const Py_UNICODE *unicode, Py_ssize_t size,
2958 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002959 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002960{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002961 if (*exceptionObject == NULL) {
2962 *exceptionObject = PyUnicodeEncodeError_Create(
2963 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002964 }
2965 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002966 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2967 goto onError;
2968 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2969 goto onError;
2970 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2971 goto onError;
2972 return;
2973 onError:
2974 Py_DECREF(*exceptionObject);
2975 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002976 }
2977}
2978
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002979/* raises a UnicodeEncodeError */
2980static void raise_encode_exception(PyObject **exceptionObject,
2981 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002982 const Py_UNICODE *unicode, Py_ssize_t size,
2983 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002984 const char *reason)
2985{
2986 make_encode_exception(exceptionObject,
2987 encoding, unicode, size, startpos, endpos, reason);
2988 if (*exceptionObject != NULL)
2989 PyCodec_StrictErrors(*exceptionObject);
2990}
2991
2992/* error handling callback helper:
2993 build arguments, call the callback and check the arguments,
2994 put the result into newpos and return the replacement string, which
2995 has to be freed by the caller */
2996static PyObject *unicode_encode_call_errorhandler(const char *errors,
2997 PyObject **errorHandler,
2998 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002999 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3000 Py_ssize_t startpos, Py_ssize_t endpos,
3001 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003002{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003003 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003004
3005 PyObject *restuple;
3006 PyObject *resunicode;
3007
3008 if (*errorHandler == NULL) {
3009 *errorHandler = PyCodec_LookupError(errors);
3010 if (*errorHandler == NULL)
3011 return NULL;
3012 }
3013
3014 make_encode_exception(exceptionObject,
3015 encoding, unicode, size, startpos, endpos, reason);
3016 if (*exceptionObject == NULL)
3017 return NULL;
3018
3019 restuple = PyObject_CallFunctionObjArgs(
3020 *errorHandler, *exceptionObject, NULL);
3021 if (restuple == NULL)
3022 return NULL;
3023 if (!PyTuple_Check(restuple)) {
3024 PyErr_Format(PyExc_TypeError, &argparse[4]);
3025 Py_DECREF(restuple);
3026 return NULL;
3027 }
3028 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3029 &resunicode, newpos)) {
3030 Py_DECREF(restuple);
3031 return NULL;
3032 }
3033 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003034 *newpos = size+*newpos;
3035 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003036 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003037 Py_DECREF(restuple);
3038 return NULL;
3039 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003040 Py_INCREF(resunicode);
3041 Py_DECREF(restuple);
3042 return resunicode;
3043}
3044
3045static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003046 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003047 const char *errors,
3048 int limit)
3049{
3050 /* output object */
3051 PyObject *res;
3052 /* pointers to the beginning and end+1 of input */
3053 const Py_UNICODE *startp = p;
3054 const Py_UNICODE *endp = p + size;
3055 /* pointer to the beginning of the unencodable characters */
3056 /* const Py_UNICODE *badp = NULL; */
3057 /* pointer into the output */
3058 char *str;
3059 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003060 Py_ssize_t respos = 0;
3061 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003062 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3063 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003064 PyObject *errorHandler = NULL;
3065 PyObject *exc = NULL;
3066 /* the following variable is used for caching string comparisons
3067 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3068 int known_errorHandler = -1;
3069
3070 /* allocate enough for a simple encoding without
3071 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003072 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003073 if (res == NULL)
3074 goto onError;
3075 if (size == 0)
3076 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003077 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003078 ressize = size;
3079
3080 while (p<endp) {
3081 Py_UNICODE c = *p;
3082
3083 /* can we encode this? */
3084 if (c<limit) {
3085 /* no overflow check, because we know that the space is enough */
3086 *str++ = (char)c;
3087 ++p;
3088 }
3089 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003090 Py_ssize_t unicodepos = p-startp;
3091 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003092 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003093 Py_ssize_t repsize;
3094 Py_ssize_t newpos;
3095 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003096 Py_UNICODE *uni2;
3097 /* startpos for collecting unencodable chars */
3098 const Py_UNICODE *collstart = p;
3099 const Py_UNICODE *collend = p;
3100 /* find all unecodable characters */
3101 while ((collend < endp) && ((*collend)>=limit))
3102 ++collend;
3103 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3104 if (known_errorHandler==-1) {
3105 if ((errors==NULL) || (!strcmp(errors, "strict")))
3106 known_errorHandler = 1;
3107 else if (!strcmp(errors, "replace"))
3108 known_errorHandler = 2;
3109 else if (!strcmp(errors, "ignore"))
3110 known_errorHandler = 3;
3111 else if (!strcmp(errors, "xmlcharrefreplace"))
3112 known_errorHandler = 4;
3113 else
3114 known_errorHandler = 0;
3115 }
3116 switch (known_errorHandler) {
3117 case 1: /* strict */
3118 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3119 goto onError;
3120 case 2: /* replace */
3121 while (collstart++<collend)
3122 *str++ = '?'; /* fall through */
3123 case 3: /* ignore */
3124 p = collend;
3125 break;
3126 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003127 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003128 /* determine replacement size (temporarily (mis)uses p) */
3129 for (p = collstart, repsize = 0; p < collend; ++p) {
3130 if (*p<10)
3131 repsize += 2+1+1;
3132 else if (*p<100)
3133 repsize += 2+2+1;
3134 else if (*p<1000)
3135 repsize += 2+3+1;
3136 else if (*p<10000)
3137 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003138#ifndef Py_UNICODE_WIDE
3139 else
3140 repsize += 2+5+1;
3141#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003142 else if (*p<100000)
3143 repsize += 2+5+1;
3144 else if (*p<1000000)
3145 repsize += 2+6+1;
3146 else
3147 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003148#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003149 }
3150 requiredsize = respos+repsize+(endp-collend);
3151 if (requiredsize > ressize) {
3152 if (requiredsize<2*ressize)
3153 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003154 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003155 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003156 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003157 ressize = requiredsize;
3158 }
3159 /* generate replacement (temporarily (mis)uses p) */
3160 for (p = collstart; p < collend; ++p) {
3161 str += sprintf(str, "&#%d;", (int)*p);
3162 }
3163 p = collend;
3164 break;
3165 default:
3166 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3167 encoding, reason, startp, size, &exc,
3168 collstart-startp, collend-startp, &newpos);
3169 if (repunicode == NULL)
3170 goto onError;
3171 /* need more space? (at least enough for what we
3172 have+the replacement+the rest of the string, so
3173 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003174 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003175 repsize = PyUnicode_GET_SIZE(repunicode);
3176 requiredsize = respos+repsize+(endp-collend);
3177 if (requiredsize > ressize) {
3178 if (requiredsize<2*ressize)
3179 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003180 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003181 Py_DECREF(repunicode);
3182 goto onError;
3183 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003184 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003185 ressize = requiredsize;
3186 }
3187 /* check if there is anything unencodable in the replacement
3188 and copy it to the output */
3189 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3190 c = *uni2;
3191 if (c >= limit) {
3192 raise_encode_exception(&exc, encoding, startp, size,
3193 unicodepos, unicodepos+1, reason);
3194 Py_DECREF(repunicode);
3195 goto onError;
3196 }
3197 *str = (char)c;
3198 }
3199 p = startp + newpos;
3200 Py_DECREF(repunicode);
3201 }
3202 }
3203 }
3204 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003205 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003206 if (respos<ressize)
3207 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003208 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003209 Py_XDECREF(errorHandler);
3210 Py_XDECREF(exc);
3211 return res;
3212
3213 onError:
3214 Py_XDECREF(res);
3215 Py_XDECREF(errorHandler);
3216 Py_XDECREF(exc);
3217 return NULL;
3218}
3219
Guido van Rossumd57fd912000-03-10 22:53:23 +00003220PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003221 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003222 const char *errors)
3223{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003224 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003225}
3226
3227PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3228{
3229 if (!PyUnicode_Check(unicode)) {
3230 PyErr_BadArgument();
3231 return NULL;
3232 }
3233 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3234 PyUnicode_GET_SIZE(unicode),
3235 NULL);
3236}
3237
3238/* --- 7-bit ASCII Codec -------------------------------------------------- */
3239
Guido van Rossumd57fd912000-03-10 22:53:23 +00003240PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003241 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003242 const char *errors)
3243{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003244 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003245 PyUnicodeObject *v;
3246 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003247 Py_ssize_t startinpos;
3248 Py_ssize_t endinpos;
3249 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003250 const char *e;
3251 PyObject *errorHandler = NULL;
3252 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003253
Guido van Rossumd57fd912000-03-10 22:53:23 +00003254 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003255 if (size == 1 && *(unsigned char*)s < 128) {
3256 Py_UNICODE r = *(unsigned char*)s;
3257 return PyUnicode_FromUnicode(&r, 1);
3258 }
Tim Petersced69f82003-09-16 20:30:58 +00003259
Guido van Rossumd57fd912000-03-10 22:53:23 +00003260 v = _PyUnicode_New(size);
3261 if (v == NULL)
3262 goto onError;
3263 if (size == 0)
3264 return (PyObject *)v;
3265 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003266 e = s + size;
3267 while (s < e) {
3268 register unsigned char c = (unsigned char)*s;
3269 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003270 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003271 ++s;
3272 }
3273 else {
3274 startinpos = s-starts;
3275 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003276 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003277 if (unicode_decode_call_errorhandler(
3278 errors, &errorHandler,
3279 "ascii", "ordinal not in range(128)",
3280 starts, size, &startinpos, &endinpos, &exc, &s,
3281 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003282 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003283 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003284 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003285 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003286 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003287 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003288 Py_XDECREF(errorHandler);
3289 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003290 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003291
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292 onError:
3293 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003294 Py_XDECREF(errorHandler);
3295 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003296 return NULL;
3297}
3298
Guido van Rossumd57fd912000-03-10 22:53:23 +00003299PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003300 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003301 const char *errors)
3302{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003303 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003304}
3305
3306PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3307{
3308 if (!PyUnicode_Check(unicode)) {
3309 PyErr_BadArgument();
3310 return NULL;
3311 }
3312 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3313 PyUnicode_GET_SIZE(unicode),
3314 NULL);
3315}
3316
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003317#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003318
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003319/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003320
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003321#if SIZEOF_INT < SIZEOF_SSIZE_T
3322#define NEED_RETRY
3323#endif
3324
3325/* XXX This code is limited to "true" double-byte encodings, as
3326 a) it assumes an incomplete character consists of a single byte, and
3327 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3328 encodings, see IsDBCSLeadByteEx documentation. */
3329
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte,
   0 otherwise.

   The byte must satisfy IsDBCSLeadByte(); in addition, with prev being
   the result of CharPrev(s, curr), one of the following has to hold:
   prev is the byte itself, the byte at prev is no lead byte, or prev
   lies exactly two bytes before it. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
3340
3341/*
3342 * Decode MBCS string into unicode object. If 'final' is set, converts
3343 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3344 */
3345static int decode_mbcs(PyUnicodeObject **v,
3346 const char *s, /* MBCS string */
3347 int size, /* sizeof MBCS string */
3348 int final)
3349{
3350 Py_UNICODE *p;
3351 Py_ssize_t n = 0;
3352 int usize = 0;
3353
3354 assert(size >= 0);
3355
3356 /* Skip trailing lead-byte unless 'final' is set */
3357 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3358 --size;
3359
3360 /* First get the size of the result */
3361 if (size > 0) {
3362 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3363 if (usize == 0) {
3364 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3365 return -1;
3366 }
3367 }
3368
3369 if (*v == NULL) {
3370 /* Create unicode object */
3371 *v = _PyUnicode_New(usize);
3372 if (*v == NULL)
3373 return -1;
3374 }
3375 else {
3376 /* Extend unicode object */
3377 n = PyUnicode_GET_SIZE(*v);
3378 if (_PyUnicode_Resize(v, n + usize) < 0)
3379 return -1;
3380 }
3381
3382 /* Do the conversion */
3383 if (size > 0) {
3384 p = PyUnicode_AS_UNICODE(*v) + n;
3385 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3386 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3387 return -1;
3388 }
3389 }
3390
3391 return size;
3392}
3393
3394PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3395 Py_ssize_t size,
3396 const char *errors,
3397 Py_ssize_t *consumed)
3398{
3399 PyUnicodeObject *v = NULL;
3400 int done;
3401
3402 if (consumed)
3403 *consumed = 0;
3404
3405#ifdef NEED_RETRY
3406 retry:
3407 if (size > INT_MAX)
3408 done = decode_mbcs(&v, s, INT_MAX, 0);
3409 else
3410#endif
3411 done = decode_mbcs(&v, s, (int)size, !consumed);
3412
3413 if (done < 0) {
3414 Py_XDECREF(v);
3415 return NULL;
3416 }
3417
3418 if (consumed)
3419 *consumed += done;
3420
3421#ifdef NEED_RETRY
3422 if (size > INT_MAX) {
3423 s += done;
3424 size -= done;
3425 goto retry;
3426 }
3427#endif
3428
3429 return (PyObject *)v;
3430}
3431
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003432PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003433 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003434 const char *errors)
3435{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003436 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3437}
3438
3439/*
3440 * Convert unicode into string object (MBCS).
3441 * Returns 0 if succeed, -1 otherwise.
3442 */
3443static int encode_mbcs(PyObject **repr,
3444 const Py_UNICODE *p, /* unicode */
3445 int size) /* size of unicode */
3446{
3447 int mbcssize = 0;
3448 Py_ssize_t n = 0;
3449
3450 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003451
3452 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003453 if (size > 0) {
3454 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3455 if (mbcssize == 0) {
3456 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3457 return -1;
3458 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003459 }
3460
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003461 if (*repr == NULL) {
3462 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003463 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003464 if (*repr == NULL)
3465 return -1;
3466 }
3467 else {
3468 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003469 n = PyBytes_Size(*repr);
3470 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003471 return -1;
3472 }
3473
3474 /* Do the conversion */
3475 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003476 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003477 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3478 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3479 return -1;
3480 }
3481 }
3482
3483 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003484}
3485
3486PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003487 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003488 const char *errors)
3489{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003490 PyObject *repr = NULL;
3491 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003492
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003493#ifdef NEED_RETRY
3494 retry:
3495 if (size > INT_MAX)
3496 ret = encode_mbcs(&repr, p, INT_MAX);
3497 else
3498#endif
3499 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003500
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003501 if (ret < 0) {
3502 Py_XDECREF(repr);
3503 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003504 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003505
3506#ifdef NEED_RETRY
3507 if (size > INT_MAX) {
3508 p += INT_MAX;
3509 size -= INT_MAX;
3510 goto retry;
3511 }
3512#endif
3513
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003514 return repr;
3515}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003516
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003517PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3518{
3519 if (!PyUnicode_Check(unicode)) {
3520 PyErr_BadArgument();
3521 return NULL;
3522 }
3523 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3524 PyUnicode_GET_SIZE(unicode),
3525 NULL);
3526}
3527
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003528#undef NEED_RETRY
3529
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003530#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003531
Guido van Rossumd57fd912000-03-10 22:53:23 +00003532/* --- Character Mapping Codec -------------------------------------------- */
3533
Guido van Rossumd57fd912000-03-10 22:53:23 +00003534PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003535 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003536 PyObject *mapping,
3537 const char *errors)
3538{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003539 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003540 Py_ssize_t startinpos;
3541 Py_ssize_t endinpos;
3542 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003543 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003544 PyUnicodeObject *v;
3545 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003546 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003547 PyObject *errorHandler = NULL;
3548 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003549 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003550 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003551
Guido van Rossumd57fd912000-03-10 22:53:23 +00003552 /* Default to Latin-1 */
3553 if (mapping == NULL)
3554 return PyUnicode_DecodeLatin1(s, size, errors);
3555
3556 v = _PyUnicode_New(size);
3557 if (v == NULL)
3558 goto onError;
3559 if (size == 0)
3560 return (PyObject *)v;
3561 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003562 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003563 if (PyUnicode_CheckExact(mapping)) {
3564 mapstring = PyUnicode_AS_UNICODE(mapping);
3565 maplen = PyUnicode_GET_SIZE(mapping);
3566 while (s < e) {
3567 unsigned char ch = *s;
3568 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003569
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003570 if (ch < maplen)
3571 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003572
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003573 if (x == 0xfffe) {
3574 /* undefined mapping */
3575 outpos = p-PyUnicode_AS_UNICODE(v);
3576 startinpos = s-starts;
3577 endinpos = startinpos+1;
3578 if (unicode_decode_call_errorhandler(
3579 errors, &errorHandler,
3580 "charmap", "character maps to <undefined>",
3581 starts, size, &startinpos, &endinpos, &exc, &s,
3582 (PyObject **)&v, &outpos, &p)) {
3583 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003584 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003585 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003586 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003587 *p++ = x;
3588 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003589 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003590 }
3591 else {
3592 while (s < e) {
3593 unsigned char ch = *s;
3594 PyObject *w, *x;
3595
3596 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3597 w = PyInt_FromLong((long)ch);
3598 if (w == NULL)
3599 goto onError;
3600 x = PyObject_GetItem(mapping, w);
3601 Py_DECREF(w);
3602 if (x == NULL) {
3603 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3604 /* No mapping found means: mapping is undefined. */
3605 PyErr_Clear();
3606 x = Py_None;
3607 Py_INCREF(x);
3608 } else
3609 goto onError;
3610 }
3611
3612 /* Apply mapping */
3613 if (PyInt_Check(x)) {
3614 long value = PyInt_AS_LONG(x);
3615 if (value < 0 || value > 65535) {
3616 PyErr_SetString(PyExc_TypeError,
3617 "character mapping must be in range(65536)");
3618 Py_DECREF(x);
3619 goto onError;
3620 }
3621 *p++ = (Py_UNICODE)value;
3622 }
3623 else if (x == Py_None) {
3624 /* undefined mapping */
3625 outpos = p-PyUnicode_AS_UNICODE(v);
3626 startinpos = s-starts;
3627 endinpos = startinpos+1;
3628 if (unicode_decode_call_errorhandler(
3629 errors, &errorHandler,
3630 "charmap", "character maps to <undefined>",
3631 starts, size, &startinpos, &endinpos, &exc, &s,
3632 (PyObject **)&v, &outpos, &p)) {
3633 Py_DECREF(x);
3634 goto onError;
3635 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003636 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003637 continue;
3638 }
3639 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003640 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003641
3642 if (targetsize == 1)
3643 /* 1-1 mapping */
3644 *p++ = *PyUnicode_AS_UNICODE(x);
3645
3646 else if (targetsize > 1) {
3647 /* 1-n mapping */
3648 if (targetsize > extrachars) {
3649 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003650 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3651 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003652 (targetsize << 2);
3653 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003654 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003655 if (_PyUnicode_Resize(&v,
3656 PyUnicode_GET_SIZE(v) + needed) < 0) {
3657 Py_DECREF(x);
3658 goto onError;
3659 }
3660 p = PyUnicode_AS_UNICODE(v) + oldpos;
3661 }
3662 Py_UNICODE_COPY(p,
3663 PyUnicode_AS_UNICODE(x),
3664 targetsize);
3665 p += targetsize;
3666 extrachars -= targetsize;
3667 }
3668 /* 1-0 mapping: skip the character */
3669 }
3670 else {
3671 /* wrong return value */
3672 PyErr_SetString(PyExc_TypeError,
3673 "character mapping must return integer, None or unicode");
3674 Py_DECREF(x);
3675 goto onError;
3676 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003677 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003678 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003679 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003680 }
3681 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003682 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003683 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003684 Py_XDECREF(errorHandler);
3685 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003686 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003687
Guido van Rossumd57fd912000-03-10 22:53:23 +00003688 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003689 Py_XDECREF(errorHandler);
3690 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003691 Py_XDECREF(v);
3692 return NULL;
3693}
3694
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003695/* Charmap encoding: the lookup table */
3696
3697struct encoding_map{
3698 PyObject_HEAD
3699 unsigned char level1[32];
3700 int count2, count3;
3701 unsigned char level23[1];
3702};
3703
3704static PyObject*
3705encoding_map_size(PyObject *obj, PyObject* args)
3706{
3707 struct encoding_map *map = (struct encoding_map*)obj;
3708 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3709 128*map->count3);
3710}
3711
3712static PyMethodDef encoding_map_methods[] = {
3713 {"size", encoding_map_size, METH_NOARGS,
3714 PyDoc_STR("Return the size (in bytes) of this object") },
3715 { 0 }
3716};
3717
3718static void
3719encoding_map_dealloc(PyObject* o)
3720{
3721 PyObject_FREE(o);
3722}
3723
3724static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003725 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003726 "EncodingMap", /*tp_name*/
3727 sizeof(struct encoding_map), /*tp_basicsize*/
3728 0, /*tp_itemsize*/
3729 /* methods */
3730 encoding_map_dealloc, /*tp_dealloc*/
3731 0, /*tp_print*/
3732 0, /*tp_getattr*/
3733 0, /*tp_setattr*/
3734 0, /*tp_compare*/
3735 0, /*tp_repr*/
3736 0, /*tp_as_number*/
3737 0, /*tp_as_sequence*/
3738 0, /*tp_as_mapping*/
3739 0, /*tp_hash*/
3740 0, /*tp_call*/
3741 0, /*tp_str*/
3742 0, /*tp_getattro*/
3743 0, /*tp_setattro*/
3744 0, /*tp_as_buffer*/
3745 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3746 0, /*tp_doc*/
3747 0, /*tp_traverse*/
3748 0, /*tp_clear*/
3749 0, /*tp_richcompare*/
3750 0, /*tp_weaklistoffset*/
3751 0, /*tp_iter*/
3752 0, /*tp_iternext*/
3753 encoding_map_methods, /*tp_methods*/
3754 0, /*tp_members*/
3755 0, /*tp_getset*/
3756 0, /*tp_base*/
3757 0, /*tp_dict*/
3758 0, /*tp_descr_get*/
3759 0, /*tp_descr_set*/
3760 0, /*tp_dictoffset*/
3761 0, /*tp_init*/
3762 0, /*tp_alloc*/
3763 0, /*tp_new*/
3764 0, /*tp_free*/
3765 0, /*tp_is_gc*/
3766};
3767
3768PyObject*
3769PyUnicode_BuildEncodingMap(PyObject* string)
3770{
3771 Py_UNICODE *decode;
3772 PyObject *result;
3773 struct encoding_map *mresult;
3774 int i;
3775 int need_dict = 0;
3776 unsigned char level1[32];
3777 unsigned char level2[512];
3778 unsigned char *mlevel1, *mlevel2, *mlevel3;
3779 int count2 = 0, count3 = 0;
3780
3781 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3782 PyErr_BadArgument();
3783 return NULL;
3784 }
3785 decode = PyUnicode_AS_UNICODE(string);
3786 memset(level1, 0xFF, sizeof level1);
3787 memset(level2, 0xFF, sizeof level2);
3788
3789 /* If there isn't a one-to-one mapping of NULL to \0,
3790 or if there are non-BMP characters, we need to use
3791 a mapping dictionary. */
3792 if (decode[0] != 0)
3793 need_dict = 1;
3794 for (i = 1; i < 256; i++) {
3795 int l1, l2;
3796 if (decode[i] == 0
3797 #ifdef Py_UNICODE_WIDE
3798 || decode[i] > 0xFFFF
3799 #endif
3800 ) {
3801 need_dict = 1;
3802 break;
3803 }
3804 if (decode[i] == 0xFFFE)
3805 /* unmapped character */
3806 continue;
3807 l1 = decode[i] >> 11;
3808 l2 = decode[i] >> 7;
3809 if (level1[l1] == 0xFF)
3810 level1[l1] = count2++;
3811 if (level2[l2] == 0xFF)
3812 level2[l2] = count3++;
3813 }
3814
3815 if (count2 >= 0xFF || count3 >= 0xFF)
3816 need_dict = 1;
3817
3818 if (need_dict) {
3819 PyObject *result = PyDict_New();
3820 PyObject *key, *value;
3821 if (!result)
3822 return NULL;
3823 for (i = 0; i < 256; i++) {
3824 key = value = NULL;
3825 key = PyInt_FromLong(decode[i]);
3826 value = PyInt_FromLong(i);
3827 if (!key || !value)
3828 goto failed1;
3829 if (PyDict_SetItem(result, key, value) == -1)
3830 goto failed1;
3831 Py_DECREF(key);
3832 Py_DECREF(value);
3833 }
3834 return result;
3835 failed1:
3836 Py_XDECREF(key);
3837 Py_XDECREF(value);
3838 Py_DECREF(result);
3839 return NULL;
3840 }
3841
3842 /* Create a three-level trie */
3843 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3844 16*count2 + 128*count3 - 1);
3845 if (!result)
3846 return PyErr_NoMemory();
3847 PyObject_Init(result, &EncodingMapType);
3848 mresult = (struct encoding_map*)result;
3849 mresult->count2 = count2;
3850 mresult->count3 = count3;
3851 mlevel1 = mresult->level1;
3852 mlevel2 = mresult->level23;
3853 mlevel3 = mresult->level23 + 16*count2;
3854 memcpy(mlevel1, level1, 32);
3855 memset(mlevel2, 0xFF, 16*count2);
3856 memset(mlevel3, 0, 128*count3);
3857 count3 = 0;
3858 for (i = 1; i < 256; i++) {
3859 int o1, o2, o3, i2, i3;
3860 if (decode[i] == 0xFFFE)
3861 /* unmapped character */
3862 continue;
3863 o1 = decode[i]>>11;
3864 o2 = (decode[i]>>7) & 0xF;
3865 i2 = 16*mlevel1[o1] + o2;
3866 if (mlevel2[i2] == 0xFF)
3867 mlevel2[i2] = count3++;
3868 o3 = decode[i] & 0x7F;
3869 i3 = 128*mlevel2[i2] + o3;
3870 mlevel3[i3] = i;
3871 }
3872 return result;
3873}
3874
3875static int
3876encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3877{
3878 struct encoding_map *map = (struct encoding_map*)mapping;
3879 int l1 = c>>11;
3880 int l2 = (c>>7) & 0xF;
3881 int l3 = c & 0x7F;
3882 int i;
3883
3884#ifdef Py_UNICODE_WIDE
3885 if (c > 0xFFFF) {
3886 return -1;
3887 }
3888#endif
3889 if (c == 0)
3890 return 0;
3891 /* level 1*/
3892 i = map->level1[l1];
3893 if (i == 0xFF) {
3894 return -1;
3895 }
3896 /* level 2*/
3897 i = map->level23[16*i+l2];
3898 if (i == 0xFF) {
3899 return -1;
3900 }
3901 /* level 3 */
3902 i = map->level23[16*map->count2 + 128*i + l3];
3903 if (i == 0) {
3904 return -1;
3905 }
3906 return i;
3907}
3908
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003909/* Lookup the character ch in the mapping. If the character
3910 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003911 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003912static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003913{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003914 PyObject *w = PyInt_FromLong((long)c);
3915 PyObject *x;
3916
3917 if (w == NULL)
3918 return NULL;
3919 x = PyObject_GetItem(mapping, w);
3920 Py_DECREF(w);
3921 if (x == NULL) {
3922 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3923 /* No mapping found means: mapping is undefined. */
3924 PyErr_Clear();
3925 x = Py_None;
3926 Py_INCREF(x);
3927 return x;
3928 } else
3929 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003930 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003931 else if (x == Py_None)
3932 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003933 else if (PyInt_Check(x)) {
3934 long value = PyInt_AS_LONG(x);
3935 if (value < 0 || value > 255) {
3936 PyErr_SetString(PyExc_TypeError,
3937 "character mapping must be in range(256)");
3938 Py_DECREF(x);
3939 return NULL;
3940 }
3941 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003942 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003943 else if (PyString_Check(x))
3944 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003945 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003946 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00003947 PyErr_Format(PyExc_TypeError,
3948 "character mapping must return integer, None or str8, not %.400s",
3949 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003950 Py_DECREF(x);
3951 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003952 }
3953}
3954
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003955static int
Walter Dörwald827b0552007-05-12 13:23:53 +00003956charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003957{
Walter Dörwald827b0552007-05-12 13:23:53 +00003958 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003959 /* exponentially overallocate to minimize reallocations */
3960 if (requiredsize < 2*outsize)
3961 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00003962 if (PyBytes_Resize(outobj, requiredsize)) {
3963 Py_DECREF(outobj);
3964 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003965 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003966 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003967}
3968
/* Outcome of trying to encode a single character through a charmap:
     enc_SUCCESS   - the character was encoded and written to the output
     enc_FAILED    - the character has no mapping; nothing was written
     enc_EXCEPTION - an error occurred while looking up or writing */
typedef enum charmapencode_result {
    enc_SUCCESS = 0,
    enc_FAILED = 1,
    enc_EXCEPTION = 2
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003972/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00003973 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003974 space is available. Return a new reference to the object that
3975 was put in the output buffer, or Py_None, if the mapping was undefined
3976 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003977 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003978static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003979charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00003980 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003981{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003982 PyObject *rep;
3983 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00003984 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003985
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003986 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003987 int res = encoding_map_lookup(c, mapping);
3988 Py_ssize_t requiredsize = *outpos+1;
3989 if (res == -1)
3990 return enc_FAILED;
3991 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003992 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003993 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00003994 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003995 outstart[(*outpos)++] = (char)res;
3996 return enc_SUCCESS;
3997 }
3998
3999 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004000 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004001 return enc_EXCEPTION;
4002 else if (rep==Py_None) {
4003 Py_DECREF(rep);
4004 return enc_FAILED;
4005 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004006 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004007 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004008 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004009 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004010 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004011 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004012 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004013 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004014 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4015 }
4016 else {
4017 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004018 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4019 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004020 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004021 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004022 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004023 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004024 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004025 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004026 memcpy(outstart + *outpos, repchars, repsize);
4027 *outpos += repsize;
4028 }
4029 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004030 Py_DECREF(rep);
4031 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004032}
4033
4034/* handle an error in PyUnicode_EncodeCharmap
4035 Return 0 on success, -1 on error */
4036static
4037int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004038 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004039 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004040 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004041 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004042{
4043 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004044 Py_ssize_t repsize;
4045 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004046 Py_UNICODE *uni2;
4047 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004048 Py_ssize_t collstartpos = *inpos;
4049 Py_ssize_t collendpos = *inpos+1;
4050 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004051 char *encoding = "charmap";
4052 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004053 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004054
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004055 /* find all unencodable characters */
4056 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004057 PyObject *rep;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004058 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004059 int res = encoding_map_lookup(p[collendpos], mapping);
4060 if (res != -1)
4061 break;
4062 ++collendpos;
4063 continue;
4064 }
4065
4066 rep = charmapencode_lookup(p[collendpos], mapping);
4067 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004068 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004069 else if (rep!=Py_None) {
4070 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004071 break;
4072 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004073 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004074 ++collendpos;
4075 }
4076 /* cache callback name lookup
4077 * (if not done yet, i.e. it's the first error) */
4078 if (*known_errorHandler==-1) {
4079 if ((errors==NULL) || (!strcmp(errors, "strict")))
4080 *known_errorHandler = 1;
4081 else if (!strcmp(errors, "replace"))
4082 *known_errorHandler = 2;
4083 else if (!strcmp(errors, "ignore"))
4084 *known_errorHandler = 3;
4085 else if (!strcmp(errors, "xmlcharrefreplace"))
4086 *known_errorHandler = 4;
4087 else
4088 *known_errorHandler = 0;
4089 }
4090 switch (*known_errorHandler) {
4091 case 1: /* strict */
4092 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4093 return -1;
4094 case 2: /* replace */
4095 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4096 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004097 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004098 return -1;
4099 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004100 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004101 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4102 return -1;
4103 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004104 }
4105 /* fall through */
4106 case 3: /* ignore */
4107 *inpos = collendpos;
4108 break;
4109 case 4: /* xmlcharrefreplace */
4110 /* generate replacement (temporarily (mis)uses p) */
4111 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4112 char buffer[2+29+1+1];
4113 char *cp;
4114 sprintf(buffer, "&#%d;", (int)p[collpos]);
4115 for (cp = buffer; *cp; ++cp) {
4116 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004117 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004118 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004119 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004120 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4121 return -1;
4122 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123 }
4124 }
4125 *inpos = collendpos;
4126 break;
4127 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004128 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004129 encoding, reason, p, size, exceptionObject,
4130 collstartpos, collendpos, &newpos);
4131 if (repunicode == NULL)
4132 return -1;
4133 /* generate replacement */
4134 repsize = PyUnicode_GET_SIZE(repunicode);
4135 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4136 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004137 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004138 return -1;
4139 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004140 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004141 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004142 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4143 return -1;
4144 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004145 }
4146 *inpos = newpos;
4147 Py_DECREF(repunicode);
4148 }
4149 return 0;
4150}
4151
Guido van Rossumd57fd912000-03-10 22:53:23 +00004152PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004153 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004154 PyObject *mapping,
4155 const char *errors)
4156{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004157 /* output object */
4158 PyObject *res = NULL;
4159 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004160 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004161 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004162 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004163 PyObject *errorHandler = NULL;
4164 PyObject *exc = NULL;
4165 /* the following variable is used for caching string comparisons
4166 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4167 * 3=ignore, 4=xmlcharrefreplace */
4168 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004169
4170 /* Default to Latin-1 */
4171 if (mapping == NULL)
4172 return PyUnicode_EncodeLatin1(p, size, errors);
4173
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004174 /* allocate enough for a simple encoding without
4175 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004176 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004177 if (res == NULL)
4178 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004179 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004180 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004181
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004182 while (inpos<size) {
4183 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004184 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004185 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004186 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004187 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004188 if (charmap_encoding_error(p, size, &inpos, mapping,
4189 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004190 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004191 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004192 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004193 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004194 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004195 else
4196 /* done with this character => adjust input position */
4197 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004198 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004200 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004201 if (respos<PyBytes_GET_SIZE(res)) {
4202 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004203 goto onError;
4204 }
4205 Py_XDECREF(exc);
4206 Py_XDECREF(errorHandler);
4207 return res;
4208
4209 onError:
4210 Py_XDECREF(res);
4211 Py_XDECREF(exc);
4212 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004213 return NULL;
4214}
4215
4216PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4217 PyObject *mapping)
4218{
4219 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4220 PyErr_BadArgument();
4221 return NULL;
4222 }
4223 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4224 PyUnicode_GET_SIZE(unicode),
4225 mapping,
4226 NULL);
4227}
4228
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004229/* create or adjust a UnicodeTranslateError */
4230static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004231 const Py_UNICODE *unicode, Py_ssize_t size,
4232 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004233 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004234{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004235 if (*exceptionObject == NULL) {
4236 *exceptionObject = PyUnicodeTranslateError_Create(
4237 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004238 }
4239 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004240 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4241 goto onError;
4242 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4243 goto onError;
4244 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4245 goto onError;
4246 return;
4247 onError:
4248 Py_DECREF(*exceptionObject);
4249 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004250 }
4251}
4252
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004253/* raises a UnicodeTranslateError */
4254static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004255 const Py_UNICODE *unicode, Py_ssize_t size,
4256 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004257 const char *reason)
4258{
4259 make_translate_exception(exceptionObject,
4260 unicode, size, startpos, endpos, reason);
4261 if (*exceptionObject != NULL)
4262 PyCodec_StrictErrors(*exceptionObject);
4263}
4264
4265/* error handling callback helper:
4266 build arguments, call the callback and check the arguments,
4267 put the result into newpos and return the replacement string, which
4268 has to be freed by the caller */
4269static PyObject *unicode_translate_call_errorhandler(const char *errors,
4270 PyObject **errorHandler,
4271 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004272 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4273 Py_ssize_t startpos, Py_ssize_t endpos,
4274 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004275{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004276 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004277
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004278 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004279 PyObject *restuple;
4280 PyObject *resunicode;
4281
4282 if (*errorHandler == NULL) {
4283 *errorHandler = PyCodec_LookupError(errors);
4284 if (*errorHandler == NULL)
4285 return NULL;
4286 }
4287
4288 make_translate_exception(exceptionObject,
4289 unicode, size, startpos, endpos, reason);
4290 if (*exceptionObject == NULL)
4291 return NULL;
4292
4293 restuple = PyObject_CallFunctionObjArgs(
4294 *errorHandler, *exceptionObject, NULL);
4295 if (restuple == NULL)
4296 return NULL;
4297 if (!PyTuple_Check(restuple)) {
4298 PyErr_Format(PyExc_TypeError, &argparse[4]);
4299 Py_DECREF(restuple);
4300 return NULL;
4301 }
4302 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004303 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004304 Py_DECREF(restuple);
4305 return NULL;
4306 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004307 if (i_newpos<0)
4308 *newpos = size+i_newpos;
4309 else
4310 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004311 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004312 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004313 Py_DECREF(restuple);
4314 return NULL;
4315 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004316 Py_INCREF(resunicode);
4317 Py_DECREF(restuple);
4318 return resunicode;
4319}
4320
4321/* Lookup the character ch in the mapping and put the result in result,
4322 which must be decrefed by the caller.
4323 Return 0 on success, -1 on error */
4324static
4325int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4326{
4327 PyObject *w = PyInt_FromLong((long)c);
4328 PyObject *x;
4329
4330 if (w == NULL)
4331 return -1;
4332 x = PyObject_GetItem(mapping, w);
4333 Py_DECREF(w);
4334 if (x == NULL) {
4335 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4336 /* No mapping found means: use 1:1 mapping. */
4337 PyErr_Clear();
4338 *result = NULL;
4339 return 0;
4340 } else
4341 return -1;
4342 }
4343 else if (x == Py_None) {
4344 *result = x;
4345 return 0;
4346 }
4347 else if (PyInt_Check(x)) {
4348 long value = PyInt_AS_LONG(x);
4349 long max = PyUnicode_GetMax();
4350 if (value < 0 || value > max) {
4351 PyErr_Format(PyExc_TypeError,
4352 "character mapping must be in range(0x%lx)", max+1);
4353 Py_DECREF(x);
4354 return -1;
4355 }
4356 *result = x;
4357 return 0;
4358 }
4359 else if (PyUnicode_Check(x)) {
4360 *result = x;
4361 return 0;
4362 }
4363 else {
4364 /* wrong return value */
4365 PyErr_SetString(PyExc_TypeError,
4366 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004367 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004368 return -1;
4369 }
4370}
4371/* ensure that *outobj is at least requiredsize characters long,
4372if not reallocate and adjust various state variables.
4373Return 0 on success, -1 on error */
4374static
Walter Dörwald4894c302003-10-24 14:25:28 +00004375int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004376 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004377{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004378 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004379 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004380 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004381 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004382 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004383 if (requiredsize < 2 * oldsize)
4384 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004385 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004386 return -1;
4387 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004388 }
4389 return 0;
4390}
4391/* lookup the character, put the result in the output string and adjust
4392 various state variables. Return a new reference to the object that
4393 was put in the output buffer in *result, or Py_None, if the mapping was
4394 undefined (in which case no character was written).
4395 The called must decref result.
4396 Return 0 on success, -1 on error. */
4397static
Walter Dörwald4894c302003-10-24 14:25:28 +00004398int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004399 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004400 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004401{
Walter Dörwald4894c302003-10-24 14:25:28 +00004402 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004403 return -1;
4404 if (*res==NULL) {
4405 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004406 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004407 }
4408 else if (*res==Py_None)
4409 ;
4410 else if (PyInt_Check(*res)) {
4411 /* no overflow check, because we know that the space is enough */
4412 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4413 }
4414 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004415 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004416 if (repsize==1) {
4417 /* no overflow check, because we know that the space is enough */
4418 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4419 }
4420 else if (repsize!=0) {
4421 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004422 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004423 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004424 repsize - 1;
4425 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004426 return -1;
4427 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4428 *outp += repsize;
4429 }
4430 }
4431 else
4432 return -1;
4433 return 0;
4434}
4435
4436PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004437 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004438 PyObject *mapping,
4439 const char *errors)
4440{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004441 /* output object */
4442 PyObject *res = NULL;
4443 /* pointers to the beginning and end+1 of input */
4444 const Py_UNICODE *startp = p;
4445 const Py_UNICODE *endp = p + size;
4446 /* pointer into the output */
4447 Py_UNICODE *str;
4448 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004449 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004450 char *reason = "character maps to <undefined>";
4451 PyObject *errorHandler = NULL;
4452 PyObject *exc = NULL;
4453 /* the following variable is used for caching string comparisons
4454 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4455 * 3=ignore, 4=xmlcharrefreplace */
4456 int known_errorHandler = -1;
4457
Guido van Rossumd57fd912000-03-10 22:53:23 +00004458 if (mapping == NULL) {
4459 PyErr_BadArgument();
4460 return NULL;
4461 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004462
4463 /* allocate enough for a simple 1:1 translation without
4464 replacements, if we need more, we'll resize */
4465 res = PyUnicode_FromUnicode(NULL, size);
4466 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004467 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004469 return res;
4470 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004472 while (p<endp) {
4473 /* try to encode it */
4474 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004475 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004476 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004477 goto onError;
4478 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004479 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004480 if (x!=Py_None) /* it worked => adjust input pointer */
4481 ++p;
4482 else { /* untranslatable character */
4483 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004484 Py_ssize_t repsize;
4485 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004486 Py_UNICODE *uni2;
4487 /* startpos for collecting untranslatable chars */
4488 const Py_UNICODE *collstart = p;
4489 const Py_UNICODE *collend = p+1;
4490 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004491
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004492 /* find all untranslatable characters */
4493 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004494 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004495 goto onError;
4496 Py_XDECREF(x);
4497 if (x!=Py_None)
4498 break;
4499 ++collend;
4500 }
4501 /* cache callback name lookup
4502 * (if not done yet, i.e. it's the first error) */
4503 if (known_errorHandler==-1) {
4504 if ((errors==NULL) || (!strcmp(errors, "strict")))
4505 known_errorHandler = 1;
4506 else if (!strcmp(errors, "replace"))
4507 known_errorHandler = 2;
4508 else if (!strcmp(errors, "ignore"))
4509 known_errorHandler = 3;
4510 else if (!strcmp(errors, "xmlcharrefreplace"))
4511 known_errorHandler = 4;
4512 else
4513 known_errorHandler = 0;
4514 }
4515 switch (known_errorHandler) {
4516 case 1: /* strict */
4517 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4518 goto onError;
4519 case 2: /* replace */
4520 /* No need to check for space, this is a 1:1 replacement */
4521 for (coll = collstart; coll<collend; ++coll)
4522 *str++ = '?';
4523 /* fall through */
4524 case 3: /* ignore */
4525 p = collend;
4526 break;
4527 case 4: /* xmlcharrefreplace */
4528 /* generate replacement (temporarily (mis)uses p) */
4529 for (p = collstart; p < collend; ++p) {
4530 char buffer[2+29+1+1];
4531 char *cp;
4532 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004533 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004534 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4535 goto onError;
4536 for (cp = buffer; *cp; ++cp)
4537 *str++ = *cp;
4538 }
4539 p = collend;
4540 break;
4541 default:
4542 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4543 reason, startp, size, &exc,
4544 collstart-startp, collend-startp, &newpos);
4545 if (repunicode == NULL)
4546 goto onError;
4547 /* generate replacement */
4548 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004549 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004550 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4551 Py_DECREF(repunicode);
4552 goto onError;
4553 }
4554 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4555 *str++ = *uni2;
4556 p = startp + newpos;
4557 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004558 }
4559 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004560 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004561 /* Resize if we allocated to much */
4562 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004563 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004564 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004565 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004566 }
4567 Py_XDECREF(exc);
4568 Py_XDECREF(errorHandler);
4569 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004570
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004571 onError:
4572 Py_XDECREF(res);
4573 Py_XDECREF(exc);
4574 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004575 return NULL;
4576}
4577
4578PyObject *PyUnicode_Translate(PyObject *str,
4579 PyObject *mapping,
4580 const char *errors)
4581{
4582 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004583
Guido van Rossumd57fd912000-03-10 22:53:23 +00004584 str = PyUnicode_FromObject(str);
4585 if (str == NULL)
4586 goto onError;
4587 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4588 PyUnicode_GET_SIZE(str),
4589 mapping,
4590 errors);
4591 Py_DECREF(str);
4592 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004593
Guido van Rossumd57fd912000-03-10 22:53:23 +00004594 onError:
4595 Py_XDECREF(str);
4596 return NULL;
4597}
Tim Petersced69f82003-09-16 20:30:58 +00004598
Guido van Rossum9e896b32000-04-05 20:11:21 +00004599/* --- Decimal Encoder ---------------------------------------------------- */
4600
4601int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004602 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004603 char *output,
4604 const char *errors)
4605{
4606 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004607 PyObject *errorHandler = NULL;
4608 PyObject *exc = NULL;
4609 const char *encoding = "decimal";
4610 const char *reason = "invalid decimal Unicode string";
4611 /* the following variable is used for caching string comparisons
4612 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4613 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004614
4615 if (output == NULL) {
4616 PyErr_BadArgument();
4617 return -1;
4618 }
4619
4620 p = s;
4621 end = s + length;
4622 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004623 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004624 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004625 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004626 Py_ssize_t repsize;
4627 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004628 Py_UNICODE *uni2;
4629 Py_UNICODE *collstart;
4630 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004631
Guido van Rossum9e896b32000-04-05 20:11:21 +00004632 if (Py_UNICODE_ISSPACE(ch)) {
4633 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004634 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004635 continue;
4636 }
4637 decimal = Py_UNICODE_TODECIMAL(ch);
4638 if (decimal >= 0) {
4639 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004640 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004641 continue;
4642 }
Guido van Rossumba477042000-04-06 18:18:10 +00004643 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004644 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004645 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004646 continue;
4647 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004648 /* All other characters are considered unencodable */
4649 collstart = p;
4650 collend = p+1;
4651 while (collend < end) {
4652 if ((0 < *collend && *collend < 256) ||
4653 !Py_UNICODE_ISSPACE(*collend) ||
4654 Py_UNICODE_TODECIMAL(*collend))
4655 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004656 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004657 /* cache callback name lookup
4658 * (if not done yet, i.e. it's the first error) */
4659 if (known_errorHandler==-1) {
4660 if ((errors==NULL) || (!strcmp(errors, "strict")))
4661 known_errorHandler = 1;
4662 else if (!strcmp(errors, "replace"))
4663 known_errorHandler = 2;
4664 else if (!strcmp(errors, "ignore"))
4665 known_errorHandler = 3;
4666 else if (!strcmp(errors, "xmlcharrefreplace"))
4667 known_errorHandler = 4;
4668 else
4669 known_errorHandler = 0;
4670 }
4671 switch (known_errorHandler) {
4672 case 1: /* strict */
4673 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4674 goto onError;
4675 case 2: /* replace */
4676 for (p = collstart; p < collend; ++p)
4677 *output++ = '?';
4678 /* fall through */
4679 case 3: /* ignore */
4680 p = collend;
4681 break;
4682 case 4: /* xmlcharrefreplace */
4683 /* generate replacement (temporarily (mis)uses p) */
4684 for (p = collstart; p < collend; ++p)
4685 output += sprintf(output, "&#%d;", (int)*p);
4686 p = collend;
4687 break;
4688 default:
4689 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4690 encoding, reason, s, length, &exc,
4691 collstart-s, collend-s, &newpos);
4692 if (repunicode == NULL)
4693 goto onError;
4694 /* generate replacement */
4695 repsize = PyUnicode_GET_SIZE(repunicode);
4696 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4697 Py_UNICODE ch = *uni2;
4698 if (Py_UNICODE_ISSPACE(ch))
4699 *output++ = ' ';
4700 else {
4701 decimal = Py_UNICODE_TODECIMAL(ch);
4702 if (decimal >= 0)
4703 *output++ = '0' + decimal;
4704 else if (0 < ch && ch < 256)
4705 *output++ = (char)ch;
4706 else {
4707 Py_DECREF(repunicode);
4708 raise_encode_exception(&exc, encoding,
4709 s, length, collstart-s, collend-s, reason);
4710 goto onError;
4711 }
4712 }
4713 }
4714 p = s + newpos;
4715 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004716 }
4717 }
4718 /* 0-terminate the output string */
4719 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004720 Py_XDECREF(exc);
4721 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004722 return 0;
4723
4724 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004725 Py_XDECREF(exc);
4726 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004727 return -1;
4728}
4729
Guido van Rossumd57fd912000-03-10 22:53:23 +00004730/* --- Helpers ------------------------------------------------------------ */
4731
Thomas Wouters477c8d52006-05-27 19:21:47 +00004732#define STRINGLIB_CHAR Py_UNICODE
4733
4734#define STRINGLIB_LEN PyUnicode_GET_SIZE
4735#define STRINGLIB_NEW PyUnicode_FromUnicode
4736#define STRINGLIB_STR PyUnicode_AS_UNICODE
4737
4738Py_LOCAL_INLINE(int)
4739STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004741 if (str[0] != other[0])
4742 return 1;
4743 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744}
4745
Thomas Wouters477c8d52006-05-27 19:21:47 +00004746#define STRINGLIB_EMPTY unicode_empty
4747
4748#include "stringlib/fastsearch.h"
4749
4750#include "stringlib/count.h"
4751#include "stringlib/find.h"
4752#include "stringlib/partition.h"
4753
/* helper macro to fixup start/end slice values

   Operates on variables named `start` and `end` that must exist in the
   expanding scope; `obj` is a pointer to something with a `length`
   member.  A negative index is taken relative to the length and then
   clamped to 0; `end` is additionally clamped to the length.  `start`
   is not clamped against the length.

   Expands to more than one statement, so it must not be used as the
   unbraced body of an if/else/loop. */
#define FIX_START_END(obj) \
    if (start < 0) { \
        start += (obj)->length; \
        if (start < 0) \
            start = 0; \
    } \
    if (end > (obj)->length) \
        end = (obj)->length; \
    else if (end < 0) { \
        end += (obj)->length; \
        if (end < 0) \
            end = 0; \
    }
4766
Martin v. Löwis18e16552006-02-15 17:27:45 +00004767Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004768 PyObject *substr,
4769 Py_ssize_t start,
4770 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004771{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004772 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004773 PyUnicodeObject* str_obj;
4774 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004775
Thomas Wouters477c8d52006-05-27 19:21:47 +00004776 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4777 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004778 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004779 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4780 if (!sub_obj) {
4781 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004782 return -1;
4783 }
Tim Petersced69f82003-09-16 20:30:58 +00004784
Thomas Wouters477c8d52006-05-27 19:21:47 +00004785 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004786
Thomas Wouters477c8d52006-05-27 19:21:47 +00004787 result = stringlib_count(
4788 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4789 );
4790
4791 Py_DECREF(sub_obj);
4792 Py_DECREF(str_obj);
4793
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794 return result;
4795}
4796
Martin v. Löwis18e16552006-02-15 17:27:45 +00004797Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004798 PyObject *sub,
4799 Py_ssize_t start,
4800 Py_ssize_t end,
4801 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004802{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004803 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004804
Guido van Rossumd57fd912000-03-10 22:53:23 +00004805 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004806 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004807 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004808 sub = PyUnicode_FromObject(sub);
4809 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004810 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004811 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812 }
Tim Petersced69f82003-09-16 20:30:58 +00004813
Thomas Wouters477c8d52006-05-27 19:21:47 +00004814 if (direction > 0)
4815 result = stringlib_find_slice(
4816 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4817 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4818 start, end
4819 );
4820 else
4821 result = stringlib_rfind_slice(
4822 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4823 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4824 start, end
4825 );
4826
Guido van Rossumd57fd912000-03-10 22:53:23 +00004827 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004828 Py_DECREF(sub);
4829
Guido van Rossumd57fd912000-03-10 22:53:23 +00004830 return result;
4831}
4832
Tim Petersced69f82003-09-16 20:30:58 +00004833static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004834int tailmatch(PyUnicodeObject *self,
4835 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004836 Py_ssize_t start,
4837 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004838 int direction)
4839{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840 if (substring->length == 0)
4841 return 1;
4842
Thomas Wouters477c8d52006-05-27 19:21:47 +00004843 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844
4845 end -= substring->length;
4846 if (end < start)
4847 return 0;
4848
4849 if (direction > 0) {
4850 if (Py_UNICODE_MATCH(self, end, substring))
4851 return 1;
4852 } else {
4853 if (Py_UNICODE_MATCH(self, start, substring))
4854 return 1;
4855 }
4856
4857 return 0;
4858}
4859
Martin v. Löwis18e16552006-02-15 17:27:45 +00004860Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004861 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004862 Py_ssize_t start,
4863 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864 int direction)
4865{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004866 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004867
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868 str = PyUnicode_FromObject(str);
4869 if (str == NULL)
4870 return -1;
4871 substr = PyUnicode_FromObject(substr);
4872 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004873 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004874 return -1;
4875 }
Tim Petersced69f82003-09-16 20:30:58 +00004876
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877 result = tailmatch((PyUnicodeObject *)str,
4878 (PyUnicodeObject *)substr,
4879 start, end, direction);
4880 Py_DECREF(str);
4881 Py_DECREF(substr);
4882 return result;
4883}
4884
Guido van Rossumd57fd912000-03-10 22:53:23 +00004885/* Apply fixfct filter to the Unicode object self and return a
4886 reference to the modified object */
4887
Tim Petersced69f82003-09-16 20:30:58 +00004888static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004889PyObject *fixup(PyUnicodeObject *self,
4890 int (*fixfct)(PyUnicodeObject *s))
4891{
4892
4893 PyUnicodeObject *u;
4894
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004895 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004896 if (u == NULL)
4897 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004898
4899 Py_UNICODE_COPY(u->str, self->str, self->length);
4900
Tim Peters7a29bd52001-09-12 03:03:31 +00004901 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004902 /* fixfct should return TRUE if it modified the buffer. If
4903 FALSE, return a reference to the original buffer instead
4904 (to save space, not time) */
4905 Py_INCREF(self);
4906 Py_DECREF(u);
4907 return (PyObject*) self;
4908 }
4909 return (PyObject*) u;
4910}
4911
Tim Petersced69f82003-09-16 20:30:58 +00004912static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004913int fixupper(PyUnicodeObject *self)
4914{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004915 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004916 Py_UNICODE *s = self->str;
4917 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004918
Guido van Rossumd57fd912000-03-10 22:53:23 +00004919 while (len-- > 0) {
4920 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004921
Guido van Rossumd57fd912000-03-10 22:53:23 +00004922 ch = Py_UNICODE_TOUPPER(*s);
4923 if (ch != *s) {
4924 status = 1;
4925 *s = ch;
4926 }
4927 s++;
4928 }
4929
4930 return status;
4931}
4932
Tim Petersced69f82003-09-16 20:30:58 +00004933static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004934int fixlower(PyUnicodeObject *self)
4935{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004936 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937 Py_UNICODE *s = self->str;
4938 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004939
Guido van Rossumd57fd912000-03-10 22:53:23 +00004940 while (len-- > 0) {
4941 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004942
Guido van Rossumd57fd912000-03-10 22:53:23 +00004943 ch = Py_UNICODE_TOLOWER(*s);
4944 if (ch != *s) {
4945 status = 1;
4946 *s = ch;
4947 }
4948 s++;
4949 }
4950
4951 return status;
4952}
4953
Tim Petersced69f82003-09-16 20:30:58 +00004954static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004955int fixswapcase(PyUnicodeObject *self)
4956{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004957 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004958 Py_UNICODE *s = self->str;
4959 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004960
Guido van Rossumd57fd912000-03-10 22:53:23 +00004961 while (len-- > 0) {
4962 if (Py_UNICODE_ISUPPER(*s)) {
4963 *s = Py_UNICODE_TOLOWER(*s);
4964 status = 1;
4965 } else if (Py_UNICODE_ISLOWER(*s)) {
4966 *s = Py_UNICODE_TOUPPER(*s);
4967 status = 1;
4968 }
4969 s++;
4970 }
4971
4972 return status;
4973}
4974
Tim Petersced69f82003-09-16 20:30:58 +00004975static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004976int fixcapitalize(PyUnicodeObject *self)
4977{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004978 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004979 Py_UNICODE *s = self->str;
4980 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004981
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004982 if (len == 0)
4983 return 0;
4984 if (Py_UNICODE_ISLOWER(*s)) {
4985 *s = Py_UNICODE_TOUPPER(*s);
4986 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004987 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004988 s++;
4989 while (--len > 0) {
4990 if (Py_UNICODE_ISUPPER(*s)) {
4991 *s = Py_UNICODE_TOLOWER(*s);
4992 status = 1;
4993 }
4994 s++;
4995 }
4996 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004997}
4998
4999static
5000int fixtitle(PyUnicodeObject *self)
5001{
5002 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5003 register Py_UNICODE *e;
5004 int previous_is_cased;
5005
5006 /* Shortcut for single character strings */
5007 if (PyUnicode_GET_SIZE(self) == 1) {
5008 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5009 if (*p != ch) {
5010 *p = ch;
5011 return 1;
5012 }
5013 else
5014 return 0;
5015 }
Tim Petersced69f82003-09-16 20:30:58 +00005016
Guido van Rossumd57fd912000-03-10 22:53:23 +00005017 e = p + PyUnicode_GET_SIZE(self);
5018 previous_is_cased = 0;
5019 for (; p < e; p++) {
5020 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005021
Guido van Rossumd57fd912000-03-10 22:53:23 +00005022 if (previous_is_cased)
5023 *p = Py_UNICODE_TOLOWER(ch);
5024 else
5025 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005026
5027 if (Py_UNICODE_ISLOWER(ch) ||
5028 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005029 Py_UNICODE_ISTITLE(ch))
5030 previous_is_cased = 1;
5031 else
5032 previous_is_cased = 0;
5033 }
5034 return 1;
5035}
5036
Tim Peters8ce9f162004-08-27 01:49:32 +00005037PyObject *
5038PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005039{
Tim Peters8ce9f162004-08-27 01:49:32 +00005040 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005041 const Py_UNICODE blank = ' ';
5042 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005043 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005044 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005045 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5046 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005047 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5048 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005049 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005050 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005051 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005052
Tim Peters05eba1f2004-08-27 21:32:02 +00005053 fseq = PySequence_Fast(seq, "");
5054 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005055 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005056 }
5057
Tim Peters91879ab2004-08-27 22:35:44 +00005058 /* Grrrr. A codec may be invoked to convert str objects to
5059 * Unicode, and so it's possible to call back into Python code
5060 * during PyUnicode_FromObject(), and so it's possible for a sick
5061 * codec to change the size of fseq (if seq is a list). Therefore
5062 * we have to keep refetching the size -- can't assume seqlen
5063 * is invariant.
5064 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005065 seqlen = PySequence_Fast_GET_SIZE(fseq);
5066 /* If empty sequence, return u"". */
5067 if (seqlen == 0) {
5068 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5069 goto Done;
5070 }
5071 /* If singleton sequence with an exact Unicode, return that. */
5072 if (seqlen == 1) {
5073 item = PySequence_Fast_GET_ITEM(fseq, 0);
5074 if (PyUnicode_CheckExact(item)) {
5075 Py_INCREF(item);
5076 res = (PyUnicodeObject *)item;
5077 goto Done;
5078 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005079 }
5080
Tim Peters05eba1f2004-08-27 21:32:02 +00005081 /* At least two items to join, or one that isn't exact Unicode. */
5082 if (seqlen > 1) {
5083 /* Set up sep and seplen -- they're needed. */
5084 if (separator == NULL) {
5085 sep = &blank;
5086 seplen = 1;
5087 }
5088 else {
5089 internal_separator = PyUnicode_FromObject(separator);
5090 if (internal_separator == NULL)
5091 goto onError;
5092 sep = PyUnicode_AS_UNICODE(internal_separator);
5093 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005094 /* In case PyUnicode_FromObject() mutated seq. */
5095 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005096 }
5097 }
5098
5099 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005100 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005101 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005102 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005103 res_p = PyUnicode_AS_UNICODE(res);
5104 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005105
Tim Peters05eba1f2004-08-27 21:32:02 +00005106 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005107 Py_ssize_t itemlen;
5108 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005109
5110 item = PySequence_Fast_GET_ITEM(fseq, i);
5111 /* Convert item to Unicode. */
5112 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5113 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005114 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005115 " %.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00005116 i, Py_Type(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005117 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005118 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005119 item = PyUnicode_FromObject(item);
5120 if (item == NULL)
5121 goto onError;
5122 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005123
Tim Peters91879ab2004-08-27 22:35:44 +00005124 /* In case PyUnicode_FromObject() mutated seq. */
5125 seqlen = PySequence_Fast_GET_SIZE(fseq);
5126
Tim Peters8ce9f162004-08-27 01:49:32 +00005127 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005128 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005129 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005130 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005131 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005132 if (i < seqlen - 1) {
5133 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005134 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005135 goto Overflow;
5136 }
5137 if (new_res_used > res_alloc) {
5138 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005139 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005140 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005141 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005142 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005143 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005144 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005145 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005146 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005147 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005148 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005149 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005150
5151 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005152 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005153 res_p += itemlen;
5154 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005155 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005156 res_p += seplen;
5157 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005158 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005159 res_used = new_res_used;
5160 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005161
Tim Peters05eba1f2004-08-27 21:32:02 +00005162 /* Shrink res to match the used area; this probably can't fail,
5163 * but it's cheap to check.
5164 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005165 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005166 goto onError;
5167
5168 Done:
5169 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005170 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005171 return (PyObject *)res;
5172
Tim Peters8ce9f162004-08-27 01:49:32 +00005173 Overflow:
5174 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005175 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005176 Py_DECREF(item);
5177 /* fall through */
5178
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005180 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005181 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005182 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183 return NULL;
5184}
5185
Tim Petersced69f82003-09-16 20:30:58 +00005186static
5187PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005188 Py_ssize_t left,
5189 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190 Py_UNICODE fill)
5191{
5192 PyUnicodeObject *u;
5193
5194 if (left < 0)
5195 left = 0;
5196 if (right < 0)
5197 right = 0;
5198
Tim Peters7a29bd52001-09-12 03:03:31 +00005199 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200 Py_INCREF(self);
5201 return self;
5202 }
5203
5204 u = _PyUnicode_New(left + self->length + right);
5205 if (u) {
5206 if (left)
5207 Py_UNICODE_FILL(u->str, fill, left);
5208 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5209 if (right)
5210 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5211 }
5212
5213 return u;
5214}
5215
/* Append the slice data[left:right] to `list` as a new Unicode object.

   Relies on names from the expanding function: a `PyObject *str`
   scratch variable, the target `list`, and an `onError` label that is
   jumped to when the slice cannot be created or appended.  The
   temporary reference held in `str` is released on every path.
   Expands to several statements; use it only inside braces. */
#define SPLIT_APPEND(data, left, right) \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
    if (!str) \
        goto onError; \
    if (PyList_Append(list, str)) { \
        Py_DECREF(str); \
        goto onError; \
    } \
    Py_DECREF(str);
5226
5227static
5228PyObject *split_whitespace(PyUnicodeObject *self,
5229 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005230 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005232 register Py_ssize_t i;
5233 register Py_ssize_t j;
5234 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005235 PyObject *str;
5236
5237 for (i = j = 0; i < len; ) {
5238 /* find a token */
5239 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5240 i++;
5241 j = i;
5242 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5243 i++;
5244 if (j < i) {
5245 if (maxcount-- <= 0)
5246 break;
5247 SPLIT_APPEND(self->str, j, i);
5248 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5249 i++;
5250 j = i;
5251 }
5252 }
5253 if (j < len) {
5254 SPLIT_APPEND(self->str, j, len);
5255 }
5256 return list;
5257
5258 onError:
5259 Py_DECREF(list);
5260 return NULL;
5261}
5262
5263PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005264 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005266 register Py_ssize_t i;
5267 register Py_ssize_t j;
5268 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269 PyObject *list;
5270 PyObject *str;
5271 Py_UNICODE *data;
5272
5273 string = PyUnicode_FromObject(string);
5274 if (string == NULL)
5275 return NULL;
5276 data = PyUnicode_AS_UNICODE(string);
5277 len = PyUnicode_GET_SIZE(string);
5278
Guido van Rossumd57fd912000-03-10 22:53:23 +00005279 list = PyList_New(0);
5280 if (!list)
5281 goto onError;
5282
5283 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005284 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005285
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005287 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289
5290 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005291 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292 if (i < len) {
5293 if (data[i] == '\r' && i + 1 < len &&
5294 data[i+1] == '\n')
5295 i += 2;
5296 else
5297 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005298 if (keepends)
5299 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005300 }
Guido van Rossum86662912000-04-11 15:38:46 +00005301 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005302 j = i;
5303 }
5304 if (j < len) {
5305 SPLIT_APPEND(data, j, len);
5306 }
5307
5308 Py_DECREF(string);
5309 return list;
5310
5311 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005312 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313 Py_DECREF(string);
5314 return NULL;
5315}
5316
Tim Petersced69f82003-09-16 20:30:58 +00005317static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005318PyObject *split_char(PyUnicodeObject *self,
5319 PyObject *list,
5320 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005321 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005322{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005323 register Py_ssize_t i;
5324 register Py_ssize_t j;
5325 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005326 PyObject *str;
5327
5328 for (i = j = 0; i < len; ) {
5329 if (self->str[i] == ch) {
5330 if (maxcount-- <= 0)
5331 break;
5332 SPLIT_APPEND(self->str, j, i);
5333 i = j = i + 1;
5334 } else
5335 i++;
5336 }
5337 if (j <= len) {
5338 SPLIT_APPEND(self->str, j, len);
5339 }
5340 return list;
5341
5342 onError:
5343 Py_DECREF(list);
5344 return NULL;
5345}
5346
Tim Petersced69f82003-09-16 20:30:58 +00005347static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348PyObject *split_substring(PyUnicodeObject *self,
5349 PyObject *list,
5350 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005351 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005353 register Py_ssize_t i;
5354 register Py_ssize_t j;
5355 Py_ssize_t len = self->length;
5356 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005357 PyObject *str;
5358
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005359 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360 if (Py_UNICODE_MATCH(self, i, substring)) {
5361 if (maxcount-- <= 0)
5362 break;
5363 SPLIT_APPEND(self->str, j, i);
5364 i = j = i + sublen;
5365 } else
5366 i++;
5367 }
5368 if (j <= len) {
5369 SPLIT_APPEND(self->str, j, len);
5370 }
5371 return list;
5372
5373 onError:
5374 Py_DECREF(list);
5375 return NULL;
5376}
5377
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005378static
5379PyObject *rsplit_whitespace(PyUnicodeObject *self,
5380 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005381 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005382{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005383 register Py_ssize_t i;
5384 register Py_ssize_t j;
5385 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005386 PyObject *str;
5387
5388 for (i = j = len - 1; i >= 0; ) {
5389 /* find a token */
5390 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5391 i--;
5392 j = i;
5393 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5394 i--;
5395 if (j > i) {
5396 if (maxcount-- <= 0)
5397 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005398 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005399 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5400 i--;
5401 j = i;
5402 }
5403 }
5404 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005405 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005406 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005407 if (PyList_Reverse(list) < 0)
5408 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005409 return list;
5410
5411 onError:
5412 Py_DECREF(list);
5413 return NULL;
5414}
5415
5416static
5417PyObject *rsplit_char(PyUnicodeObject *self,
5418 PyObject *list,
5419 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005420 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005421{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005422 register Py_ssize_t i;
5423 register Py_ssize_t j;
5424 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005425 PyObject *str;
5426
5427 for (i = j = len - 1; i >= 0; ) {
5428 if (self->str[i] == ch) {
5429 if (maxcount-- <= 0)
5430 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005431 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005432 j = i = i - 1;
5433 } else
5434 i--;
5435 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005436 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005437 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005438 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005439 if (PyList_Reverse(list) < 0)
5440 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005441 return list;
5442
5443 onError:
5444 Py_DECREF(list);
5445 return NULL;
5446}
5447
5448static
5449PyObject *rsplit_substring(PyUnicodeObject *self,
5450 PyObject *list,
5451 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005452 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005453{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005454 register Py_ssize_t i;
5455 register Py_ssize_t j;
5456 Py_ssize_t len = self->length;
5457 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005458 PyObject *str;
5459
5460 for (i = len - sublen, j = len; i >= 0; ) {
5461 if (Py_UNICODE_MATCH(self, i, substring)) {
5462 if (maxcount-- <= 0)
5463 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005464 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005465 j = i;
5466 i -= sublen;
5467 } else
5468 i--;
5469 }
5470 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005471 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005472 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005473 if (PyList_Reverse(list) < 0)
5474 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005475 return list;
5476
5477 onError:
5478 Py_DECREF(list);
5479 return NULL;
5480}
5481
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482#undef SPLIT_APPEND
5483
5484static
5485PyObject *split(PyUnicodeObject *self,
5486 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005487 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005488{
5489 PyObject *list;
5490
5491 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005492 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493
5494 list = PyList_New(0);
5495 if (!list)
5496 return NULL;
5497
5498 if (substring == NULL)
5499 return split_whitespace(self,list,maxcount);
5500
5501 else if (substring->length == 1)
5502 return split_char(self,list,substring->str[0],maxcount);
5503
5504 else if (substring->length == 0) {
5505 Py_DECREF(list);
5506 PyErr_SetString(PyExc_ValueError, "empty separator");
5507 return NULL;
5508 }
5509 else
5510 return split_substring(self,list,substring,maxcount);
5511}
5512
Tim Petersced69f82003-09-16 20:30:58 +00005513static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005514PyObject *rsplit(PyUnicodeObject *self,
5515 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005516 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005517{
5518 PyObject *list;
5519
5520 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005521 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005522
5523 list = PyList_New(0);
5524 if (!list)
5525 return NULL;
5526
5527 if (substring == NULL)
5528 return rsplit_whitespace(self,list,maxcount);
5529
5530 else if (substring->length == 1)
5531 return rsplit_char(self,list,substring->str[0],maxcount);
5532
5533 else if (substring->length == 0) {
5534 Py_DECREF(list);
5535 PyErr_SetString(PyExc_ValueError, "empty separator");
5536 return NULL;
5537 }
5538 else
5539 return rsplit_substring(self,list,substring,maxcount);
5540}
5541
5542static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005543PyObject *replace(PyUnicodeObject *self,
5544 PyUnicodeObject *str1,
5545 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005546 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547{
5548 PyUnicodeObject *u;
5549
5550 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005551 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552
Thomas Wouters477c8d52006-05-27 19:21:47 +00005553 if (str1->length == str2->length) {
5554 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005555 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005556 if (str1->length == 1) {
5557 /* replace characters */
5558 Py_UNICODE u1, u2;
5559 if (!findchar(self->str, self->length, str1->str[0]))
5560 goto nothing;
5561 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5562 if (!u)
5563 return NULL;
5564 Py_UNICODE_COPY(u->str, self->str, self->length);
5565 u1 = str1->str[0];
5566 u2 = str2->str[0];
5567 for (i = 0; i < u->length; i++)
5568 if (u->str[i] == u1) {
5569 if (--maxcount < 0)
5570 break;
5571 u->str[i] = u2;
5572 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005574 i = fastsearch(
5575 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005577 if (i < 0)
5578 goto nothing;
5579 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5580 if (!u)
5581 return NULL;
5582 Py_UNICODE_COPY(u->str, self->str, self->length);
5583 while (i <= self->length - str1->length)
5584 if (Py_UNICODE_MATCH(self, i, str1)) {
5585 if (--maxcount < 0)
5586 break;
5587 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5588 i += str1->length;
5589 } else
5590 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005591 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005593
5594 Py_ssize_t n, i, j, e;
5595 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005596 Py_UNICODE *p;
5597
5598 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005599 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600 if (n > maxcount)
5601 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005602 if (n == 0)
5603 goto nothing;
5604 /* new_size = self->length + n * (str2->length - str1->length)); */
5605 delta = (str2->length - str1->length);
5606 if (delta == 0) {
5607 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005608 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005609 product = n * (str2->length - str1->length);
5610 if ((product / (str2->length - str1->length)) != n) {
5611 PyErr_SetString(PyExc_OverflowError,
5612 "replace string is too long");
5613 return NULL;
5614 }
5615 new_size = self->length + product;
5616 if (new_size < 0) {
5617 PyErr_SetString(PyExc_OverflowError,
5618 "replace string is too long");
5619 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620 }
5621 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005622 u = _PyUnicode_New(new_size);
5623 if (!u)
5624 return NULL;
5625 i = 0;
5626 p = u->str;
5627 e = self->length - str1->length;
5628 if (str1->length > 0) {
5629 while (n-- > 0) {
5630 /* look for next match */
5631 j = i;
5632 while (j <= e) {
5633 if (Py_UNICODE_MATCH(self, j, str1))
5634 break;
5635 j++;
5636 }
5637 if (j > i) {
5638 if (j > e)
5639 break;
5640 /* copy unchanged part [i:j] */
5641 Py_UNICODE_COPY(p, self->str+i, j-i);
5642 p += j - i;
5643 }
5644 /* copy substitution string */
5645 if (str2->length > 0) {
5646 Py_UNICODE_COPY(p, str2->str, str2->length);
5647 p += str2->length;
5648 }
5649 i = j + str1->length;
5650 }
5651 if (i < self->length)
5652 /* copy tail [i:] */
5653 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5654 } else {
5655 /* interleave */
5656 while (n > 0) {
5657 Py_UNICODE_COPY(p, str2->str, str2->length);
5658 p += str2->length;
5659 if (--n <= 0)
5660 break;
5661 *p++ = self->str[i++];
5662 }
5663 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5664 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005667
5668nothing:
5669 /* nothing to replace; return original string (when possible) */
5670 if (PyUnicode_CheckExact(self)) {
5671 Py_INCREF(self);
5672 return (PyObject *) self;
5673 }
5674 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675}
5676
5677/* --- Unicode Object Methods --------------------------------------------- */
5678
PyDoc_STRVAR(title__doc__,
"S.title() -> unicode\n\
\n\
Return a titlecased version of S, i.e. words start with title case\n\
characters, all remaining cased characters have lower case.");

/* Implementation of S.title(): delegates to fixup() with the fixtitle
   callback and returns whatever fixup() returns.  Both helpers are
   defined outside this chunk. */
static PyObject*
unicode_title(PyUnicodeObject *self)
{
    return fixup(self, fixtitle);
}
5690
PyDoc_STRVAR(capitalize__doc__,
"S.capitalize() -> unicode\n\
\n\
Return a capitalized version of S, i.e. make the first character\n\
have upper case.");

/* Implementation of S.capitalize(): delegates to fixup() with the
   fixcapitalize callback and returns whatever fixup() returns.  Both
   helpers are defined outside this chunk. */
static PyObject*
unicode_capitalize(PyUnicodeObject *self)
{
    return fixup(self, fixcapitalize);
}
5702
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split self on whitespace,
   run every word through fixup(..., fixcapitalize) and join the words
   again with PyUnicode_Join(NULL, ...).  Returns NULL on failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list items in place */
    while (pos < PyList_GET_SIZE(words)) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* drop the old word before the slot is overwritten */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
        pos++;
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
5740
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005741/* Argument converter. Coerces to a single unicode character */
5742
5743static int
5744convert_uc(PyObject *obj, void *addr)
5745{
5746 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5747 PyObject *uniobj;
5748 Py_UNICODE *unistr;
5749
5750 uniobj = PyUnicode_FromObject(obj);
5751 if (uniobj == NULL) {
5752 PyErr_SetString(PyExc_TypeError,
5753 "The fill character cannot be converted to Unicode");
5754 return 0;
5755 }
5756 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5757 PyErr_SetString(PyExc_TypeError,
5758 "The fill character must be exactly one character long");
5759 Py_DECREF(uniobj);
5760 return 0;
5761 }
5762 unistr = PyUnicode_AS_UNICODE(uniobj);
5763 *fillcharloc = unistr[0];
5764 Py_DECREF(uniobj);
5765 return 1;
5766}
5767
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005768PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005769"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005771Return S centered in a Unicode string of length width. Padding is\n\
5772done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773
5774static PyObject *
5775unicode_center(PyUnicodeObject *self, PyObject *args)
5776{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005777 Py_ssize_t marg, left;
5778 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005779 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005780
Thomas Woutersde017742006-02-16 19:34:37 +00005781 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782 return NULL;
5783
Tim Peters7a29bd52001-09-12 03:03:31 +00005784 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785 Py_INCREF(self);
5786 return (PyObject*) self;
5787 }
5788
5789 marg = width - self->length;
5790 left = marg / 2 + (marg & width & 1);
5791
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005792 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005793}
5794
#if 0

/* This code should go into some future Unicode collation support
   module. The basic comparison should compare ordinals on a naive
   basis (this is what Java does and thus JPython too). */

/* speedy UTF-16 code point order comparison */
/* gleaned from: */
/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */

/* Adjustment table indexed by (code unit >> 11).  Only entries 27..31
   are non-zero: index 27 is shifted up by 0x2000 and indices 28..31 are
   shifted down by 0x800. */
static short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};

/* Disabled variant: compare str1 and str2 code unit by code unit after
   applying utf16Fixup to each unit.  Returns -1, 0 or 1. */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    Py_ssize_t len1, len2;

    Py_UNICODE *s1 = str1->str;
    Py_UNICODE *s2 = str2->str;

    len1 = str1->length;
    len2 = str2->length;

    while (len1 > 0 && len2 > 0) {
        Py_UNICODE c1, c2;

        c1 = *s1++;
        c2 = *s2++;

        /* only units above (1<<11)*26 == 0xD000 can need adjusting */
        if (c1 > (1<<11) * 26)
            c1 += utf16Fixup[c1>>11];
        if (c2 > (1<<11) * 26)
            c2 += utf16Fixup[c2>>11];
        /* now c1 and c2 are in UTF-32-compatible order */

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;

        len1--; len2--;
    }

    /* common prefix exhausted: the shorter string sorts first */
    return (len1 < len2) ? -1 : (len1 != len2);
}

#else

/* Active variant: plain ordinal comparison of str1 and str2.
   Returns -1 if str1 sorts before str2, 1 if it sorts after, and 0 if
   both have the same length and contents. */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    register Py_ssize_t len1, len2;

    Py_UNICODE *s1 = str1->str;
    Py_UNICODE *s2 = str2->str;

    len1 = str1->length;
    len2 = str2->length;

    while (len1 > 0 && len2 > 0) {
        Py_UNICODE c1, c2;

        c1 = *s1++;
        c2 = *s2++;

        /* the first differing code unit decides the order */
        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;

        len1--; len2--;
    }

    /* common prefix exhausted: the shorter string sorts first */
    return (len1 < len2) ? -1 : (len1 != len2);
}

#endif
5874
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875int PyUnicode_Compare(PyObject *left,
5876 PyObject *right)
5877{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005878 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5879 return unicode_compare((PyUnicodeObject *)left,
5880 (PyUnicodeObject *)right);
5881 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5882 (PyUnicode_Check(left) && PyString_Check(right))) {
5883 if (PyUnicode_Check(left))
5884 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5885 if (PyUnicode_Check(right))
5886 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5887 assert(PyString_Check(left));
5888 assert(PyString_Check(right));
5889 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005891 PyErr_Format(PyExc_TypeError,
5892 "Can't compare %.100s and %.100s",
5893 left->ob_type->tp_name,
5894 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895 return -1;
5896}
5897
Martin v. Löwis5b222132007-06-10 09:51:05 +00005898int
5899PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
5900{
5901 int i;
5902 Py_UNICODE *id;
5903 assert(PyUnicode_Check(uni));
5904 id = PyUnicode_AS_UNICODE(uni);
5905 /* Compare Unicode string and source character set string */
5906 for (i = 0; id[i] && str[i]; i++)
5907 if (id[i] != str[i])
5908 return ((int)id[i] < (int)str[i]) ? -1 : 1;
5909 if (id[i])
5910 return 1; /* uni is longer */
5911 if (str[i])
5912 return -1; /* str is longer */
5913 return 0;
5914}
5915
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005916PyObject *PyUnicode_RichCompare(PyObject *left,
5917 PyObject *right,
5918 int op)
5919{
5920 int result;
5921
5922 result = PyUnicode_Compare(left, right);
5923 if (result == -1 && PyErr_Occurred())
5924 goto onError;
5925
5926 /* Convert the return value to a Boolean */
5927 switch (op) {
5928 case Py_EQ:
5929 result = (result == 0);
5930 break;
5931 case Py_NE:
5932 result = (result != 0);
5933 break;
5934 case Py_LE:
5935 result = (result <= 0);
5936 break;
5937 case Py_GE:
5938 result = (result >= 0);
5939 break;
5940 case Py_LT:
5941 result = (result == -1);
5942 break;
5943 case Py_GT:
5944 result = (result == 1);
5945 break;
5946 }
5947 return PyBool_FromLong(result);
5948
5949 onError:
5950
5951 /* Standard case
5952
5953 Type errors mean that PyUnicode_FromObject() could not convert
5954 one of the arguments (usually the right hand side) to Unicode,
5955 ie. we can't handle the comparison request. However, it is
5956 possible that the other object knows a comparison method, which
5957 is why we return Py_NotImplemented to give the other object a
5958 chance.
5959
5960 */
5961 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5962 PyErr_Clear();
5963 Py_INCREF(Py_NotImplemented);
5964 return Py_NotImplemented;
5965 }
5966 if (op != Py_EQ && op != Py_NE)
5967 return NULL;
5968
5969 /* Equality comparison.
5970
5971 This is a special case: we silence any PyExc_UnicodeDecodeError
5972 and instead turn it into a PyErr_UnicodeWarning.
5973
5974 */
5975 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5976 return NULL;
5977 PyErr_Clear();
5978 if (PyErr_Warn(PyExc_UnicodeWarning,
5979 (op == Py_EQ) ?
5980 "Unicode equal comparison "
5981 "failed to convert both arguments to Unicode - "
5982 "interpreting them as being unequal" :
5983 "Unicode unequal comparison "
5984 "failed to convert both arguments to Unicode - "
5985 "interpreting them as being unequal"
5986 ) < 0)
5987 return NULL;
5988 result = (op == Py_NE);
5989 return PyBool_FromLong(result);
5990}
5991
Guido van Rossum403d68b2000-03-13 15:55:09 +00005992int PyUnicode_Contains(PyObject *container,
5993 PyObject *element)
5994{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005995 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005996 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005997
5998 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005999 sub = PyUnicode_FromObject(element);
6000 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006001 PyErr_Format(PyExc_TypeError,
6002 "'in <string>' requires string as left operand, not %s",
6003 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006004 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006005 }
6006
Thomas Wouters477c8d52006-05-27 19:21:47 +00006007 str = PyUnicode_FromObject(container);
6008 if (!str) {
6009 Py_DECREF(sub);
6010 return -1;
6011 }
6012
6013 result = stringlib_contains_obj(str, sub);
6014
6015 Py_DECREF(str);
6016 Py_DECREF(sub);
6017
Guido van Rossum403d68b2000-03-13 15:55:09 +00006018 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006019}
6020
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021/* Concat to string or Unicode object giving a new Unicode object. */
6022
6023PyObject *PyUnicode_Concat(PyObject *left,
6024 PyObject *right)
6025{
6026 PyUnicodeObject *u = NULL, *v = NULL, *w;
6027
Guido van Rossum84d79dd2007-04-13 02:23:57 +00006028 if (PyBytes_Check(left) || PyBytes_Check(right))
6029 return PyBytes_Concat(left, right);
6030
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 /* Coerce the two arguments */
6032 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6033 if (u == NULL)
6034 goto onError;
6035 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6036 if (v == NULL)
6037 goto onError;
6038
6039 /* Shortcuts */
6040 if (v == unicode_empty) {
6041 Py_DECREF(v);
6042 return (PyObject *)u;
6043 }
6044 if (u == unicode_empty) {
6045 Py_DECREF(u);
6046 return (PyObject *)v;
6047 }
6048
6049 /* Concat the two Unicode strings */
6050 w = _PyUnicode_New(u->length + v->length);
6051 if (w == NULL)
6052 goto onError;
6053 Py_UNICODE_COPY(w->str, u->str, u->length);
6054 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6055
6056 Py_DECREF(u);
6057 Py_DECREF(v);
6058 return (PyObject *)w;
6059
6060onError:
6061 Py_XDECREF(u);
6062 Py_XDECREF(v);
6063 return NULL;
6064}
6065
Walter Dörwald1ab83302007-05-18 17:15:44 +00006066void
6067PyUnicode_Append(PyObject **pleft, PyObject *right)
6068{
6069 PyObject *new;
6070 if (*pleft == NULL)
6071 return;
6072 if (right == NULL || !PyUnicode_Check(*pleft)) {
6073 Py_DECREF(*pleft);
6074 *pleft = NULL;
6075 return;
6076 }
6077 new = PyUnicode_Concat(*pleft, right);
6078 Py_DECREF(*pleft);
6079 *pleft = new;
6080}
6081
/* Same as PyUnicode_Append(), but additionally releases the caller's
   reference to right (which may be NULL). */
void
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
{
    PyUnicode_Append(pleft, right);
    Py_XDECREF(right);
}
6088
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006089PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090"S.count(sub[, start[, end]]) -> int\n\
6091\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006092Return the number of non-overlapping occurrences of substring sub in\n\
6093Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006094interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095
6096static PyObject *
6097unicode_count(PyUnicodeObject *self, PyObject *args)
6098{
6099 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006100 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006101 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102 PyObject *result;
6103
Guido van Rossumb8872e62000-05-09 14:14:27 +00006104 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6105 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106 return NULL;
6107
6108 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006109 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110 if (substring == NULL)
6111 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006112
Thomas Wouters477c8d52006-05-27 19:21:47 +00006113 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114
Thomas Wouters477c8d52006-05-27 19:21:47 +00006115 result = PyInt_FromSsize_t(
6116 stringlib_count(self->str + start, end - start,
6117 substring->str, substring->length)
6118 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119
6120 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006121
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122 return result;
6123}
6124
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006125PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006126"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006128Encodes S using the codec registered for encoding. encoding defaults\n\
6129to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006130handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006131a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6132'xmlcharrefreplace' as well as any other name registered with\n\
6133codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134
6135static PyObject *
6136unicode_encode(PyUnicodeObject *self, PyObject *args)
6137{
6138 char *encoding = NULL;
6139 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006140 PyObject *v;
6141
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6143 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006144 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006145 if (v == NULL)
6146 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006147 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00006148 if (PyString_Check(v)) {
6149 /* Old codec, turn it into bytes */
6150 PyObject *b = PyBytes_FromObject(v);
6151 Py_DECREF(v);
6152 return b;
6153 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006154 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006155 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006156 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006157 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006158 Py_DECREF(v);
6159 return NULL;
6160 }
6161 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006162
6163 onError:
6164 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006165}
6166
6167PyDoc_STRVAR(decode__doc__,
6168"S.decode([encoding[,errors]]) -> string or unicode\n\
6169\n\
6170Decodes S using the codec registered for encoding. encoding defaults\n\
6171to the default encoding. errors may be given to set a different error\n\
6172handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6173a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6174as well as any other name registerd with codecs.register_error that is\n\
6175able to handle UnicodeDecodeErrors.");
6176
6177static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006178unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006179{
6180 char *encoding = NULL;
6181 char *errors = NULL;
6182 PyObject *v;
6183
6184 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6185 return NULL;
6186 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006187 if (v == NULL)
6188 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006189 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6190 PyErr_Format(PyExc_TypeError,
6191 "decoder did not return a string/unicode object "
6192 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006193 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006194 Py_DECREF(v);
6195 return NULL;
6196 }
6197 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006198
6199 onError:
6200 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201}
6202
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006203PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204"S.expandtabs([tabsize]) -> unicode\n\
6205\n\
6206Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006207If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208
6209static PyObject*
6210unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6211{
6212 Py_UNICODE *e;
6213 Py_UNICODE *p;
6214 Py_UNICODE *q;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006215 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216 PyUnicodeObject *u;
6217 int tabsize = 8;
6218
6219 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6220 return NULL;
6221
Thomas Wouters7e474022000-07-16 12:04:32 +00006222 /* First pass: determine size of output string */
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006223 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224 e = self->str + self->length;
6225 for (p = self->str; p < e; p++)
6226 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006227 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228 j += tabsize - (j % tabsize);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006229 if (old_j > j) {
6230 PyErr_SetString(PyExc_OverflowError,
6231 "new string is too long");
6232 return NULL;
6233 }
6234 old_j = j;
6235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236 }
6237 else {
6238 j++;
6239 if (*p == '\n' || *p == '\r') {
6240 i += j;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006241 old_j = j = 0;
6242 if (i < 0) {
6243 PyErr_SetString(PyExc_OverflowError,
6244 "new string is too long");
6245 return NULL;
6246 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247 }
6248 }
6249
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006250 if ((i + j) < 0) {
6251 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6252 return NULL;
6253 }
6254
Guido van Rossumd57fd912000-03-10 22:53:23 +00006255 /* Second pass: create output string and fill it */
6256 u = _PyUnicode_New(i + j);
6257 if (!u)
6258 return NULL;
6259
6260 j = 0;
6261 q = u->str;
6262
6263 for (p = self->str; p < e; p++)
6264 if (*p == '\t') {
6265 if (tabsize > 0) {
6266 i = tabsize - (j % tabsize);
6267 j += i;
6268 while (i--)
6269 *q++ = ' ';
6270 }
6271 }
6272 else {
6273 j++;
6274 *q++ = *p;
6275 if (*p == '\n' || *p == '\r')
6276 j = 0;
6277 }
6278
6279 return (PyObject*) u;
6280}
6281
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006282PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283"S.find(sub [,start [,end]]) -> int\n\
6284\n\
6285Return the lowest index in S where substring sub is found,\n\
6286such that sub is contained within s[start,end]. Optional\n\
6287arguments start and end are interpreted as in slice notation.\n\
6288\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006289Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290
6291static PyObject *
6292unicode_find(PyUnicodeObject *self, PyObject *args)
6293{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006294 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006295 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006296 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006297 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298
Guido van Rossumb8872e62000-05-09 14:14:27 +00006299 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6300 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006302 substring = PyUnicode_FromObject(substring);
6303 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304 return NULL;
6305
Thomas Wouters477c8d52006-05-27 19:21:47 +00006306 result = stringlib_find_slice(
6307 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6308 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6309 start, end
6310 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006311
6312 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006313
6314 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006315}
6316
6317static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006318unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319{
6320 if (index < 0 || index >= self->length) {
6321 PyErr_SetString(PyExc_IndexError, "string index out of range");
6322 return NULL;
6323 }
6324
6325 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6326}
6327
6328static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006329unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006330{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006331 /* Since Unicode objects compare equal to their UTF-8 string
6332 counterparts, we hash the UTF-8 string. */
6333 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
6334 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335}
6336
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006337PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006338"S.index(sub [,start [,end]]) -> int\n\
6339\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006340Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341
6342static PyObject *
6343unicode_index(PyUnicodeObject *self, PyObject *args)
6344{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006345 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006346 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006347 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006348 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349
Guido van Rossumb8872e62000-05-09 14:14:27 +00006350 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6351 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006352 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006353 substring = PyUnicode_FromObject(substring);
6354 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006355 return NULL;
6356
Thomas Wouters477c8d52006-05-27 19:21:47 +00006357 result = stringlib_find_slice(
6358 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6359 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6360 start, end
6361 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362
6363 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006364
Guido van Rossumd57fd912000-03-10 22:53:23 +00006365 if (result < 0) {
6366 PyErr_SetString(PyExc_ValueError, "substring not found");
6367 return NULL;
6368 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006369
Martin v. Löwis18e16552006-02-15 17:27:45 +00006370 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371}
6372
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006373PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006374"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006376Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006377at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378
6379static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006380unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381{
6382 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6383 register const Py_UNICODE *e;
6384 int cased;
6385
Guido van Rossumd57fd912000-03-10 22:53:23 +00006386 /* Shortcut for single character strings */
6387 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006388 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006390 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006391 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006392 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006393
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394 e = p + PyUnicode_GET_SIZE(self);
6395 cased = 0;
6396 for (; p < e; p++) {
6397 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006398
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006400 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401 else if (!cased && Py_UNICODE_ISLOWER(ch))
6402 cased = 1;
6403 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006404 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405}
6406
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006407PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006408"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006410Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006411at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412
6413static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006414unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415{
6416 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6417 register const Py_UNICODE *e;
6418 int cased;
6419
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420 /* Shortcut for single character strings */
6421 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006422 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006424 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006425 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006426 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006427
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428 e = p + PyUnicode_GET_SIZE(self);
6429 cased = 0;
6430 for (; p < e; p++) {
6431 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006432
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006434 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435 else if (!cased && Py_UNICODE_ISUPPER(ch))
6436 cased = 1;
6437 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006438 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439}
6440
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006441PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006442"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006443\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006444Return True if S is a titlecased string and there is at least one\n\
6445character in S, i.e. upper- and titlecase characters may only\n\
6446follow uncased characters and lowercase characters only cased ones.\n\
6447Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448
6449static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006450unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451{
6452 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6453 register const Py_UNICODE *e;
6454 int cased, previous_is_cased;
6455
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456 /* Shortcut for single character strings */
6457 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006458 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6459 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006461 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006462 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006463 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006464
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465 e = p + PyUnicode_GET_SIZE(self);
6466 cased = 0;
6467 previous_is_cased = 0;
6468 for (; p < e; p++) {
6469 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006470
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6472 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006473 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006474 previous_is_cased = 1;
6475 cased = 1;
6476 }
6477 else if (Py_UNICODE_ISLOWER(ch)) {
6478 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006479 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480 previous_is_cased = 1;
6481 cased = 1;
6482 }
6483 else
6484 previous_is_cased = 0;
6485 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006486 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487}
6488
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006489PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006490"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006492Return True if all characters in S are whitespace\n\
6493and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494
6495static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006496unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497{
6498 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6499 register const Py_UNICODE *e;
6500
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501 /* Shortcut for single character strings */
6502 if (PyUnicode_GET_SIZE(self) == 1 &&
6503 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006504 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006506 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006507 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006508 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006509
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510 e = p + PyUnicode_GET_SIZE(self);
6511 for (; p < e; p++) {
6512 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006513 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006515 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516}
6517
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006518PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006519"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006520\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006521Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006522and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006523
6524static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006525unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006526{
6527 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6528 register const Py_UNICODE *e;
6529
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006530 /* Shortcut for single character strings */
6531 if (PyUnicode_GET_SIZE(self) == 1 &&
6532 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006533 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006534
6535 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006536 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006537 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006538
6539 e = p + PyUnicode_GET_SIZE(self);
6540 for (; p < e; p++) {
6541 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006542 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006543 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006544 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006545}
6546
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006547PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006548"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006549\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006550Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006551and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006552
6553static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006554unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006555{
6556 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6557 register const Py_UNICODE *e;
6558
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006559 /* Shortcut for single character strings */
6560 if (PyUnicode_GET_SIZE(self) == 1 &&
6561 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006562 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006563
6564 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006565 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006566 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006567
6568 e = p + PyUnicode_GET_SIZE(self);
6569 for (; p < e; p++) {
6570 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006571 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006572 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006573 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006574}
6575
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006576PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006577"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006579Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006580False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581
6582static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006583unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584{
6585 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6586 register const Py_UNICODE *e;
6587
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588 /* Shortcut for single character strings */
6589 if (PyUnicode_GET_SIZE(self) == 1 &&
6590 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006591 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006593 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006594 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006595 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006596
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597 e = p + PyUnicode_GET_SIZE(self);
6598 for (; p < e; p++) {
6599 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006600 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006602 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603}
6604
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006605PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006606"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006608Return True if all characters in S are digits\n\
6609and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610
6611static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006612unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613{
6614 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6615 register const Py_UNICODE *e;
6616
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617 /* Shortcut for single character strings */
6618 if (PyUnicode_GET_SIZE(self) == 1 &&
6619 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006620 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006622 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006623 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006624 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006625
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626 e = p + PyUnicode_GET_SIZE(self);
6627 for (; p < e; p++) {
6628 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006629 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006631 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632}
6633
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006634PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006635"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006637Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006638False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639
6640static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006641unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642{
6643 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6644 register const Py_UNICODE *e;
6645
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646 /* Shortcut for single character strings */
6647 if (PyUnicode_GET_SIZE(self) == 1 &&
6648 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006649 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006651 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006652 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006653 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006654
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655 e = p + PyUnicode_GET_SIZE(self);
6656 for (; p < e; p++) {
6657 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006658 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006660 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661}
6662
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006663PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664"S.join(sequence) -> unicode\n\
6665\n\
6666Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006667sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668
6669static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006670unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006672 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673}
6674
Martin v. Löwis18e16552006-02-15 17:27:45 +00006675static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676unicode_length(PyUnicodeObject *self)
6677{
6678 return self->length;
6679}
6680
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006681PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006682"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683\n\
6684Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006685done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686
6687static PyObject *
6688unicode_ljust(PyUnicodeObject *self, PyObject *args)
6689{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006690 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006691 Py_UNICODE fillchar = ' ';
6692
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006693 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694 return NULL;
6695
Tim Peters7a29bd52001-09-12 03:03:31 +00006696 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697 Py_INCREF(self);
6698 return (PyObject*) self;
6699 }
6700
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006701 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702}
6703
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006704PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705"S.lower() -> unicode\n\
6706\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006707Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708
6709static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006710unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006712 return fixup(self, fixlower);
6713}
6714
/* Strip modes.  The values double as indices into stripformat[] below, so
   the order of the two must stay in sync. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its leading
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
6723
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006724/* externally visible for str.strip(unicode) */
6725PyObject *
6726_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6727{
6728 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006729 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006730 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006731 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6732 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006733
Thomas Wouters477c8d52006-05-27 19:21:47 +00006734 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6735
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006736 i = 0;
6737 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006738 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6739 i++;
6740 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006741 }
6742
6743 j = len;
6744 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006745 do {
6746 j--;
6747 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6748 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006749 }
6750
6751 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006752 Py_INCREF(self);
6753 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006754 }
6755 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006756 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006757}
6758
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759
6760static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006761do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006763 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006764 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006765
6766 i = 0;
6767 if (striptype != RIGHTSTRIP) {
6768 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6769 i++;
6770 }
6771 }
6772
6773 j = len;
6774 if (striptype != LEFTSTRIP) {
6775 do {
6776 j--;
6777 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6778 j++;
6779 }
6780
6781 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6782 Py_INCREF(self);
6783 return (PyObject*)self;
6784 }
6785 else
6786 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787}
6788
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006789
6790static PyObject *
6791do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6792{
6793 PyObject *sep = NULL;
6794
6795 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6796 return NULL;
6797
6798 if (sep != NULL && sep != Py_None) {
6799 if (PyUnicode_Check(sep))
6800 return _PyUnicode_XStrip(self, striptype, sep);
6801 else if (PyString_Check(sep)) {
6802 PyObject *res;
6803 sep = PyUnicode_FromObject(sep);
6804 if (sep==NULL)
6805 return NULL;
6806 res = _PyUnicode_XStrip(self, striptype, sep);
6807 Py_DECREF(sep);
6808 return res;
6809 }
6810 else {
6811 PyErr_Format(PyExc_TypeError,
6812 "%s arg must be None, unicode or str",
6813 STRIPNAME(striptype));
6814 return NULL;
6815 }
6816 }
6817
6818 return do_strip(self, striptype);
6819}
6820
6821
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006822PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006823"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006824\n\
6825Return a copy of the string S with leading and trailing\n\
6826whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006827If chars is given and not None, remove characters in chars instead.\n\
6828If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006829
6830static PyObject *
6831unicode_strip(PyUnicodeObject *self, PyObject *args)
6832{
6833 if (PyTuple_GET_SIZE(args) == 0)
6834 return do_strip(self, BOTHSTRIP); /* Common case */
6835 else
6836 return do_argstrip(self, BOTHSTRIP, args);
6837}
6838
6839
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006840PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006841"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006842\n\
6843Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006844If chars is given and not None, remove characters in chars instead.\n\
6845If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006846
6847static PyObject *
6848unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6849{
6850 if (PyTuple_GET_SIZE(args) == 0)
6851 return do_strip(self, LEFTSTRIP); /* Common case */
6852 else
6853 return do_argstrip(self, LEFTSTRIP, args);
6854}
6855
6856
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006857PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006858"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006859\n\
6860Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006861If chars is given and not None, remove characters in chars instead.\n\
6862If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006863
6864static PyObject *
6865unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6866{
6867 if (PyTuple_GET_SIZE(args) == 0)
6868 return do_strip(self, RIGHTSTRIP); /* Common case */
6869 else
6870 return do_argstrip(self, RIGHTSTRIP, args);
6871}
6872
6873
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006875unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876{
6877 PyUnicodeObject *u;
6878 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006879 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006880 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881
6882 if (len < 0)
6883 len = 0;
6884
Tim Peters7a29bd52001-09-12 03:03:31 +00006885 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886 /* no repeat, return original string */
6887 Py_INCREF(str);
6888 return (PyObject*) str;
6889 }
Tim Peters8f422462000-09-09 06:13:41 +00006890
6891 /* ensure # of chars needed doesn't overflow int and # of bytes
6892 * needed doesn't overflow size_t
6893 */
6894 nchars = len * str->length;
6895 if (len && nchars / len != str->length) {
6896 PyErr_SetString(PyExc_OverflowError,
6897 "repeated string is too long");
6898 return NULL;
6899 }
6900 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6901 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6902 PyErr_SetString(PyExc_OverflowError,
6903 "repeated string is too long");
6904 return NULL;
6905 }
6906 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907 if (!u)
6908 return NULL;
6909
6910 p = u->str;
6911
Thomas Wouters477c8d52006-05-27 19:21:47 +00006912 if (str->length == 1 && len > 0) {
6913 Py_UNICODE_FILL(p, str->str[0], len);
6914 } else {
6915 Py_ssize_t done = 0; /* number of characters copied this far */
6916 if (done < nchars) {
6917 Py_UNICODE_COPY(p, str->str, str->length);
6918 done = str->length;
6919 }
6920 while (done < nchars) {
6921 int n = (done <= nchars-done) ? done : nchars-done;
6922 Py_UNICODE_COPY(p+done, p, n);
6923 done += n;
6924 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925 }
6926
6927 return (PyObject*) u;
6928}
6929
6930PyObject *PyUnicode_Replace(PyObject *obj,
6931 PyObject *subobj,
6932 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006933 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934{
6935 PyObject *self;
6936 PyObject *str1;
6937 PyObject *str2;
6938 PyObject *result;
6939
6940 self = PyUnicode_FromObject(obj);
6941 if (self == NULL)
6942 return NULL;
6943 str1 = PyUnicode_FromObject(subobj);
6944 if (str1 == NULL) {
6945 Py_DECREF(self);
6946 return NULL;
6947 }
6948 str2 = PyUnicode_FromObject(replobj);
6949 if (str2 == NULL) {
6950 Py_DECREF(self);
6951 Py_DECREF(str1);
6952 return NULL;
6953 }
Tim Petersced69f82003-09-16 20:30:58 +00006954 result = replace((PyUnicodeObject *)self,
6955 (PyUnicodeObject *)str1,
6956 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006957 maxcount);
6958 Py_DECREF(self);
6959 Py_DECREF(str1);
6960 Py_DECREF(str2);
6961 return result;
6962}
6963
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006964PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965"S.replace (old, new[, maxsplit]) -> unicode\n\
6966\n\
6967Return a copy of S with all occurrences of substring\n\
6968old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006969given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006970
6971static PyObject*
6972unicode_replace(PyUnicodeObject *self, PyObject *args)
6973{
6974 PyUnicodeObject *str1;
6975 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006976 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006977 PyObject *result;
6978
Martin v. Löwis18e16552006-02-15 17:27:45 +00006979 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980 return NULL;
6981 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6982 if (str1 == NULL)
6983 return NULL;
6984 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006985 if (str2 == NULL) {
6986 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006988 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989
6990 result = replace(self, str1, str2, maxcount);
6991
6992 Py_DECREF(str1);
6993 Py_DECREF(str2);
6994 return result;
6995}
6996
6997static
6998PyObject *unicode_repr(PyObject *unicode)
6999{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007000 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007001 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007002 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7003 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7004
7005 /* XXX(nnorwitz): rather than over-allocating, it would be
7006 better to choose a different scheme. Perhaps scan the
7007 first N-chars of the string and allocate based on that size.
7008 */
7009 /* Initial allocation is based on the longest-possible unichr
7010 escape.
7011
7012 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7013 unichr, so in this case it's the longest unichr escape. In
7014 narrow (UTF-16) builds this is five chars per source unichr
7015 since there are two unichrs in the surrogate pair, so in narrow
7016 (UTF-16) builds it's not the longest unichr escape.
7017
7018 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7019 so in the narrow (UTF-16) build case it's the longest unichr
7020 escape.
7021 */
7022
Walter Dörwald1ab83302007-05-18 17:15:44 +00007023 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007024 2 /* quotes */
7025#ifdef Py_UNICODE_WIDE
7026 + 10*size
7027#else
7028 + 6*size
7029#endif
7030 + 1);
7031 if (repr == NULL)
7032 return NULL;
7033
Walter Dörwald1ab83302007-05-18 17:15:44 +00007034 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007035
7036 /* Add quote */
7037 *p++ = (findchar(s, size, '\'') &&
7038 !findchar(s, size, '"')) ? '"' : '\'';
7039 while (size-- > 0) {
7040 Py_UNICODE ch = *s++;
7041
7042 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007043 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007044 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007045 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007046 continue;
7047 }
7048
7049#ifdef Py_UNICODE_WIDE
7050 /* Map 21-bit characters to '\U00xxxxxx' */
7051 else if (ch >= 0x10000) {
7052 *p++ = '\\';
7053 *p++ = 'U';
7054 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7055 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7056 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7057 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7058 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7059 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7060 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7061 *p++ = hexdigits[ch & 0x0000000F];
7062 continue;
7063 }
7064#else
7065 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7066 else if (ch >= 0xD800 && ch < 0xDC00) {
7067 Py_UNICODE ch2;
7068 Py_UCS4 ucs;
7069
7070 ch2 = *s++;
7071 size--;
7072 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7073 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7074 *p++ = '\\';
7075 *p++ = 'U';
7076 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7077 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7078 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7079 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7080 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7081 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7082 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7083 *p++ = hexdigits[ucs & 0x0000000F];
7084 continue;
7085 }
7086 /* Fall through: isolated surrogates are copied as-is */
7087 s--;
7088 size++;
7089 }
7090#endif
7091
7092 /* Map 16-bit characters to '\uxxxx' */
7093 if (ch >= 256) {
7094 *p++ = '\\';
7095 *p++ = 'u';
7096 *p++ = hexdigits[(ch >> 12) & 0x000F];
7097 *p++ = hexdigits[(ch >> 8) & 0x000F];
7098 *p++ = hexdigits[(ch >> 4) & 0x000F];
7099 *p++ = hexdigits[ch & 0x000F];
7100 }
7101
7102 /* Map special whitespace to '\t', \n', '\r' */
7103 else if (ch == '\t') {
7104 *p++ = '\\';
7105 *p++ = 't';
7106 }
7107 else if (ch == '\n') {
7108 *p++ = '\\';
7109 *p++ = 'n';
7110 }
7111 else if (ch == '\r') {
7112 *p++ = '\\';
7113 *p++ = 'r';
7114 }
7115
7116 /* Map non-printable US ASCII to '\xhh' */
7117 else if (ch < ' ' || ch >= 0x7F) {
7118 *p++ = '\\';
7119 *p++ = 'x';
7120 *p++ = hexdigits[(ch >> 4) & 0x000F];
7121 *p++ = hexdigits[ch & 0x000F];
7122 }
7123
7124 /* Copy everything else as-is */
7125 else
7126 *p++ = (char) ch;
7127 }
7128 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007129 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007130
7131 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007132 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007133 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007134}
7135
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007136PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137"S.rfind(sub [,start [,end]]) -> int\n\
7138\n\
7139Return the highest index in S where substring sub is found,\n\
7140such that sub is contained within s[start,end]. Optional\n\
7141arguments start and end are interpreted as in slice notation.\n\
7142\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007143Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007144
7145static PyObject *
7146unicode_rfind(PyUnicodeObject *self, PyObject *args)
7147{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007148 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007149 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007150 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007151 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152
Guido van Rossumb8872e62000-05-09 14:14:27 +00007153 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7154 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007155 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007156 substring = PyUnicode_FromObject(substring);
7157 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007158 return NULL;
7159
Thomas Wouters477c8d52006-05-27 19:21:47 +00007160 result = stringlib_rfind_slice(
7161 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7162 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7163 start, end
7164 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007165
7166 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007167
7168 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007169}
7170
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007171PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007172"S.rindex(sub [,start [,end]]) -> int\n\
7173\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007174Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007175
7176static PyObject *
7177unicode_rindex(PyUnicodeObject *self, PyObject *args)
7178{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007179 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007180 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007181 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007182 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007183
Guido van Rossumb8872e62000-05-09 14:14:27 +00007184 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7185 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007186 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007187 substring = PyUnicode_FromObject(substring);
7188 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007189 return NULL;
7190
Thomas Wouters477c8d52006-05-27 19:21:47 +00007191 result = stringlib_rfind_slice(
7192 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7193 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7194 start, end
7195 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007196
7197 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007198
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199 if (result < 0) {
7200 PyErr_SetString(PyExc_ValueError, "substring not found");
7201 return NULL;
7202 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007203 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204}
7205
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007206PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007207"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007208\n\
7209Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007210done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211
7212static PyObject *
7213unicode_rjust(PyUnicodeObject *self, PyObject *args)
7214{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007215 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007216 Py_UNICODE fillchar = ' ';
7217
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007218 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007219 return NULL;
7220
Tim Peters7a29bd52001-09-12 03:03:31 +00007221 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007222 Py_INCREF(self);
7223 return (PyObject*) self;
7224 }
7225
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007226 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227}
7228
Guido van Rossumd57fd912000-03-10 22:53:23 +00007229static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007230unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231{
7232 /* standard clamping */
7233 if (start < 0)
7234 start = 0;
7235 if (end < 0)
7236 end = 0;
7237 if (end > self->length)
7238 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007239 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007240 /* full slice, return original string */
7241 Py_INCREF(self);
7242 return (PyObject*) self;
7243 }
7244 if (start > end)
7245 start = end;
7246 /* copy slice */
7247 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7248 end - start);
7249}
7250
7251PyObject *PyUnicode_Split(PyObject *s,
7252 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007253 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254{
7255 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007256
Guido van Rossumd57fd912000-03-10 22:53:23 +00007257 s = PyUnicode_FromObject(s);
7258 if (s == NULL)
7259 return NULL;
7260 if (sep != NULL) {
7261 sep = PyUnicode_FromObject(sep);
7262 if (sep == NULL) {
7263 Py_DECREF(s);
7264 return NULL;
7265 }
7266 }
7267
7268 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7269
7270 Py_DECREF(s);
7271 Py_XDECREF(sep);
7272 return result;
7273}
7274
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007275PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007276"S.split([sep [,maxsplit]]) -> list of strings\n\
7277\n\
7278Return a list of the words in S, using sep as the\n\
7279delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007280splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007281any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282
7283static PyObject*
7284unicode_split(PyUnicodeObject *self, PyObject *args)
7285{
7286 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007287 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007288
Martin v. Löwis18e16552006-02-15 17:27:45 +00007289 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007290 return NULL;
7291
7292 if (substring == Py_None)
7293 return split(self, NULL, maxcount);
7294 else if (PyUnicode_Check(substring))
7295 return split(self, (PyUnicodeObject *)substring, maxcount);
7296 else
7297 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7298}
7299
Thomas Wouters477c8d52006-05-27 19:21:47 +00007300PyObject *
7301PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7302{
7303 PyObject* str_obj;
7304 PyObject* sep_obj;
7305 PyObject* out;
7306
7307 str_obj = PyUnicode_FromObject(str_in);
7308 if (!str_obj)
7309 return NULL;
7310 sep_obj = PyUnicode_FromObject(sep_in);
7311 if (!sep_obj) {
7312 Py_DECREF(str_obj);
7313 return NULL;
7314 }
7315
7316 out = stringlib_partition(
7317 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7318 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7319 );
7320
7321 Py_DECREF(sep_obj);
7322 Py_DECREF(str_obj);
7323
7324 return out;
7325}
7326
7327
7328PyObject *
7329PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7330{
7331 PyObject* str_obj;
7332 PyObject* sep_obj;
7333 PyObject* out;
7334
7335 str_obj = PyUnicode_FromObject(str_in);
7336 if (!str_obj)
7337 return NULL;
7338 sep_obj = PyUnicode_FromObject(sep_in);
7339 if (!sep_obj) {
7340 Py_DECREF(str_obj);
7341 return NULL;
7342 }
7343
7344 out = stringlib_rpartition(
7345 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7346 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7347 );
7348
7349 Py_DECREF(sep_obj);
7350 Py_DECREF(str_obj);
7351
7352 return out;
7353}
7354
7355PyDoc_STRVAR(partition__doc__,
7356"S.partition(sep) -> (head, sep, tail)\n\
7357\n\
7358Searches for the separator sep in S, and returns the part before it,\n\
7359the separator itself, and the part after it. If the separator is not\n\
7360found, returns S and two empty strings.");
7361
7362static PyObject*
7363unicode_partition(PyUnicodeObject *self, PyObject *separator)
7364{
7365 return PyUnicode_Partition((PyObject *)self, separator);
7366}
7367
7368PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007369"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007370\n\
7371Searches for the separator sep in S, starting at the end of S, and returns\n\
7372the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007373separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007374
7375static PyObject*
7376unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7377{
7378 return PyUnicode_RPartition((PyObject *)self, separator);
7379}
7380
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007381PyObject *PyUnicode_RSplit(PyObject *s,
7382 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007383 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007384{
7385 PyObject *result;
7386
7387 s = PyUnicode_FromObject(s);
7388 if (s == NULL)
7389 return NULL;
7390 if (sep != NULL) {
7391 sep = PyUnicode_FromObject(sep);
7392 if (sep == NULL) {
7393 Py_DECREF(s);
7394 return NULL;
7395 }
7396 }
7397
7398 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7399
7400 Py_DECREF(s);
7401 Py_XDECREF(sep);
7402 return result;
7403}
7404
7405PyDoc_STRVAR(rsplit__doc__,
7406"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7407\n\
7408Return a list of the words in S, using sep as the\n\
7409delimiter string, starting at the end of the string and\n\
7410working to the front. If maxsplit is given, at most maxsplit\n\
7411splits are done. If sep is not specified, any whitespace string\n\
7412is a separator.");
7413
7414static PyObject*
7415unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7416{
7417 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007418 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007419
Martin v. Löwis18e16552006-02-15 17:27:45 +00007420 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007421 return NULL;
7422
7423 if (substring == Py_None)
7424 return rsplit(self, NULL, maxcount);
7425 else if (PyUnicode_Check(substring))
7426 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7427 else
7428 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7429}
7430
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007431PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007432"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007433\n\
7434Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007435Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007436is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437
7438static PyObject*
7439unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7440{
Guido van Rossum86662912000-04-11 15:38:46 +00007441 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442
Guido van Rossum86662912000-04-11 15:38:46 +00007443 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444 return NULL;
7445
Guido van Rossum86662912000-04-11 15:38:46 +00007446 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007447}
7448
7449static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007450PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007451{
Walter Dörwald346737f2007-05-31 10:44:43 +00007452 if (PyUnicode_CheckExact(self)) {
7453 Py_INCREF(self);
7454 return self;
7455 } else
7456 /* Subtype -- return genuine unicode string with the same value. */
7457 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7458 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459}
7460
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007461PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462"S.swapcase() -> unicode\n\
7463\n\
7464Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007465and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466
7467static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007468unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007469{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007470 return fixup(self, fixswapcase);
7471}
7472
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007473PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474"S.translate(table) -> unicode\n\
7475\n\
7476Return a copy of the string S, where all characters have been mapped\n\
7477through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007478Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7479Unmapped characters are left untouched. Characters mapped to None\n\
7480are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007481
7482static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007483unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007484{
Tim Petersced69f82003-09-16 20:30:58 +00007485 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007486 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007487 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488 "ignore");
7489}
7490
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007491PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007492"S.upper() -> unicode\n\
7493\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007494Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007495
7496static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007497unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007499 return fixup(self, fixupper);
7500}
7501
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007502PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007503"S.zfill(width) -> unicode\n\
7504\n\
7505Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007506of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007507
7508static PyObject *
7509unicode_zfill(PyUnicodeObject *self, PyObject *args)
7510{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007511 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007512 PyUnicodeObject *u;
7513
Martin v. Löwis18e16552006-02-15 17:27:45 +00007514 Py_ssize_t width;
7515 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007516 return NULL;
7517
7518 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007519 if (PyUnicode_CheckExact(self)) {
7520 Py_INCREF(self);
7521 return (PyObject*) self;
7522 }
7523 else
7524 return PyUnicode_FromUnicode(
7525 PyUnicode_AS_UNICODE(self),
7526 PyUnicode_GET_SIZE(self)
7527 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528 }
7529
7530 fill = width - self->length;
7531
7532 u = pad(self, fill, 0, '0');
7533
Walter Dörwald068325e2002-04-15 13:36:47 +00007534 if (u == NULL)
7535 return NULL;
7536
Guido van Rossumd57fd912000-03-10 22:53:23 +00007537 if (u->str[fill] == '+' || u->str[fill] == '-') {
7538 /* move sign to beginning of string */
7539 u->str[0] = u->str[fill];
7540 u->str[fill] = '0';
7541 }
7542
7543 return (PyObject*) u;
7544}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007545
#if 0
/* Compiled out.  When enabled, returns the current value of
   unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long entries = unicode_freelist_size;

    return PyInt_FromLong(entries);
}
#endif
7553
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007554PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007555"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007557Return True if S starts with the specified prefix, False otherwise.\n\
7558With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007559With optional end, stop comparing S at that position.\n\
7560prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561
7562static PyObject *
7563unicode_startswith(PyUnicodeObject *self,
7564 PyObject *args)
7565{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007566 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007567 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007568 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007569 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007570 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007572 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007573 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007575 if (PyTuple_Check(subobj)) {
7576 Py_ssize_t i;
7577 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7578 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7579 PyTuple_GET_ITEM(subobj, i));
7580 if (substring == NULL)
7581 return NULL;
7582 result = tailmatch(self, substring, start, end, -1);
7583 Py_DECREF(substring);
7584 if (result) {
7585 Py_RETURN_TRUE;
7586 }
7587 }
7588 /* nothing matched */
7589 Py_RETURN_FALSE;
7590 }
7591 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007593 return NULL;
7594 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007596 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597}
7598
7599
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007600PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007601"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007602\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007603Return True if S ends with the specified suffix, False otherwise.\n\
7604With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007605With optional end, stop comparing S at that position.\n\
7606suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607
7608static PyObject *
7609unicode_endswith(PyUnicodeObject *self,
7610 PyObject *args)
7611{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007612 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007614 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007615 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007616 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007617
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007618 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7619 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007620 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007621 if (PyTuple_Check(subobj)) {
7622 Py_ssize_t i;
7623 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7624 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7625 PyTuple_GET_ITEM(subobj, i));
7626 if (substring == NULL)
7627 return NULL;
7628 result = tailmatch(self, substring, start, end, +1);
7629 Py_DECREF(substring);
7630 if (result) {
7631 Py_RETURN_TRUE;
7632 }
7633 }
7634 Py_RETURN_FALSE;
7635 }
7636 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007637 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007639
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007640 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007641 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007642 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643}
7644
7645
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007646
7647static PyObject *
7648unicode_getnewargs(PyUnicodeObject *v)
7649{
7650 return Py_BuildValue("(u#)", v->str, v->length);
7651}
7652
7653
Guido van Rossumd57fd912000-03-10 22:53:23 +00007654static PyMethodDef unicode_methods[] = {
7655
7656 /* Order is according to common usage: often used methods should
7657 appear first, since lookup is done sequentially. */
7658
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007659 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7660 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7661 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007662 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007663 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7664 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7665 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7666 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7667 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7668 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7669 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007670 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007671 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7672 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7673 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007674 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007675 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007676/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7677 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7678 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7679 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007680 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007681 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007682 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007683 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007684 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7685 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7686 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7687 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7688 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7689 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7690 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7691 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7692 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7693 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7694 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7695 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7696 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7697 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007698 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007699#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007700 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701#endif
7702
7703#if 0
7704 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007705 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007706#endif
7707
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007708 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007709 {NULL, NULL}
7710};
7711
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007712static PyObject *
7713unicode_mod(PyObject *v, PyObject *w)
7714{
7715 if (!PyUnicode_Check(v)) {
7716 Py_INCREF(Py_NotImplemented);
7717 return Py_NotImplemented;
7718 }
7719 return PyUnicode_Format(v, w);
7720}
7721
7722static PyNumberMethods unicode_as_number = {
7723 0, /*nb_add*/
7724 0, /*nb_subtract*/
7725 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007726 unicode_mod, /*nb_remainder*/
7727};
7728
Guido van Rossumd57fd912000-03-10 22:53:23 +00007729static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007730 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007731 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007732 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7733 (ssizeargfunc) unicode_getitem, /* sq_item */
7734 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007735 0, /* sq_ass_item */
7736 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007737 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007738};
7739
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007740static PyObject*
7741unicode_subscript(PyUnicodeObject* self, PyObject* item)
7742{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007743 if (PyIndex_Check(item)) {
7744 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007745 if (i == -1 && PyErr_Occurred())
7746 return NULL;
7747 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007748 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007749 return unicode_getitem(self, i);
7750 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007751 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007752 Py_UNICODE* source_buf;
7753 Py_UNICODE* result_buf;
7754 PyObject* result;
7755
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007756 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007757 &start, &stop, &step, &slicelength) < 0) {
7758 return NULL;
7759 }
7760
7761 if (slicelength <= 0) {
7762 return PyUnicode_FromUnicode(NULL, 0);
7763 } else {
7764 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007765 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7766 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007767
7768 if (result_buf == NULL)
7769 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007770
7771 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7772 result_buf[i] = source_buf[cur];
7773 }
Tim Petersced69f82003-09-16 20:30:58 +00007774
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007775 result = PyUnicode_FromUnicode(result_buf, slicelength);
7776 PyMem_FREE(result_buf);
7777 return result;
7778 }
7779 } else {
7780 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7781 return NULL;
7782 }
7783}
7784
7785static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007786 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007787 (binaryfunc)unicode_subscript, /* mp_subscript */
7788 (objobjargproc)0, /* mp_ass_subscript */
7789};
7790
Martin v. Löwis18e16552006-02-15 17:27:45 +00007791static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007793 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007794 const void **ptr)
7795{
7796 if (index != 0) {
7797 PyErr_SetString(PyExc_SystemError,
7798 "accessing non-existent unicode segment");
7799 return -1;
7800 }
7801 *ptr = (void *) self->str;
7802 return PyUnicode_GET_DATA_SIZE(self);
7803}
7804
Martin v. Löwis18e16552006-02-15 17:27:45 +00007805static Py_ssize_t
7806unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007807 const void **ptr)
7808{
7809 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007810 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007811 return -1;
7812}
7813
7814static int
7815unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007816 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007817{
7818 if (lenp)
7819 *lenp = PyUnicode_GET_DATA_SIZE(self);
7820 return 1;
7821}
7822
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007823static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007824unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007825 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007826 const void **ptr)
7827{
7828 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007829
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830 if (index != 0) {
7831 PyErr_SetString(PyExc_SystemError,
7832 "accessing non-existent unicode segment");
7833 return -1;
7834 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007835 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007836 if (str == NULL)
7837 return -1;
7838 *ptr = (void *) PyString_AS_STRING(str);
7839 return PyString_GET_SIZE(str);
7840}
7841
7842/* Helpers for PyUnicode_Format() */
7843
7844static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007845getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007846{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007847 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007848 if (argidx < arglen) {
7849 (*p_argidx)++;
7850 if (arglen < 0)
7851 return args;
7852 else
7853 return PyTuple_GetItem(args, argidx);
7854 }
7855 PyErr_SetString(PyExc_TypeError,
7856 "not enough arguments for format string");
7857 return NULL;
7858}
7859
/* Bit flags accumulated in `flags` while a conversion specifier is parsed;
   each occupies its own bit so they can be OR-ed together. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
7865
Martin v. Löwis18e16552006-02-15 17:27:45 +00007866static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007867strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007868{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007869 register Py_ssize_t i;
7870 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007871 for (i = len - 1; i >= 0; i--)
7872 buffer[i] = (Py_UNICODE) charbuffer[i];
7873
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874 return len;
7875}
7876
Neal Norwitzfc76d632006-01-10 06:03:13 +00007877static int
7878doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7879{
Tim Peters15231542006-02-16 01:08:01 +00007880 Py_ssize_t result;
7881
Neal Norwitzfc76d632006-01-10 06:03:13 +00007882 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007883 result = strtounicode(buffer, (char *)buffer);
7884 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007885}
7886
7887static int
7888longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7889{
Tim Peters15231542006-02-16 01:08:01 +00007890 Py_ssize_t result;
7891
Neal Norwitzfc76d632006-01-10 06:03:13 +00007892 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007893 result = strtounicode(buffer, (char *)buffer);
7894 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007895}
7896
Guido van Rossum078151d2002-08-11 04:24:12 +00007897/* XXX To save some code duplication, formatfloat/long/int could have been
7898 shared with stringobject.c, converting from 8-bit to Unicode after the
7899 formatting is done. */
7900
Guido van Rossumd57fd912000-03-10 22:53:23 +00007901static int
7902formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007903 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007904 int flags,
7905 int prec,
7906 int type,
7907 PyObject *v)
7908{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007909 /* fmt = '%#.' + `prec` + `type`
7910 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007911 char fmt[20];
7912 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007913
Guido van Rossumd57fd912000-03-10 22:53:23 +00007914 x = PyFloat_AsDouble(v);
7915 if (x == -1.0 && PyErr_Occurred())
7916 return -1;
7917 if (prec < 0)
7918 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007919 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7920 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007921 /* Worst case length calc to ensure no buffer overrun:
7922
7923 'g' formats:
7924 fmt = %#.<prec>g
7925 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7926 for any double rep.)
7927 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7928
7929 'f' formats:
7930 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7931 len = 1 + 50 + 1 + prec = 52 + prec
7932
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007933 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007934 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007935
7936 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00007937 if (((type == 'g' || type == 'G') &&
7938 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007939 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007940 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007941 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007942 return -1;
7943 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007944 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7945 (flags&F_ALT) ? "#" : "",
7946 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007947 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007948}
7949
Tim Peters38fd5b62000-09-21 05:43:11 +00007950static PyObject*
7951formatlong(PyObject *val, int flags, int prec, int type)
7952{
7953 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00007954 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00007955 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00007956 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00007957
7958 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7959 if (!str)
7960 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00007961 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00007962 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00007963 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00007964}
7965
Guido van Rossumd57fd912000-03-10 22:53:23 +00007966static int
7967formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007968 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007969 int flags,
7970 int prec,
7971 int type,
7972 PyObject *v)
7973{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007974 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007975 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7976 * + 1 + 1
7977 * = 24
7978 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007979 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007980 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981 long x;
7982
7983 x = PyInt_AsLong(v);
7984 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007985 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007986 if (x < 0 && type == 'u') {
7987 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007988 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007989 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7990 sign = "-";
7991 else
7992 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007993 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007994 prec = 1;
7995
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007996 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7997 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007998 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007999 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008000 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008001 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008002 return -1;
8003 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008004
8005 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008006 (type == 'x' || type == 'X' || type == 'o')) {
8007 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008008 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008009 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008010 * - when 0 is being converted, the C standard leaves off
8011 * the '0x' or '0X', which is inconsistent with other
8012 * %#x/%#X conversions and inconsistent with Python's
8013 * hex() function
8014 * - there are platforms that violate the standard and
8015 * convert 0 with the '0x' or '0X'
8016 * (Metrowerks, Compaq Tru64)
8017 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008018 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008019 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008020 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008021 * We can achieve the desired consistency by inserting our
8022 * own '0x' or '0X' prefix, and substituting %x/%X in place
8023 * of %#x/%#X.
8024 *
8025 * Note that this is the same approach as used in
8026 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008027 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008028 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8029 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008030 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008031 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008032 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8033 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008034 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008035 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008036 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008037 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008038 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008039 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040}
8041
8042static int
8043formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008044 size_t buflen,
8045 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008047 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008048 if (PyUnicode_Check(v)) {
8049 if (PyUnicode_GET_SIZE(v) != 1)
8050 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008051 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008052 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008053
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008054 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008055 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008056 goto onError;
8057 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8058 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008059
8060 else {
8061 /* Integer input truncated to a character */
8062 long x;
8063 x = PyInt_AsLong(v);
8064 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008065 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008066#ifdef Py_UNICODE_WIDE
8067 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008068 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008069 "%c arg not in range(0x110000) "
8070 "(wide Python build)");
8071 return -1;
8072 }
8073#else
8074 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008075 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008076 "%c arg not in range(0x10000) "
8077 "(narrow Python build)");
8078 return -1;
8079 }
8080#endif
8081 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008082 }
8083 buf[1] = '\0';
8084 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008085
8086 onError:
8087 PyErr_SetString(PyExc_TypeError,
8088 "%c requires int or char");
8089 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008090}
8091
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008092/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8093
8094 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8095 chars are formatted. XXX This is a magic number. Each formatting
8096 routine does bounds checking to ensure no overflow, but a better
8097 solution may be to malloc a buffer of appropriate size for each
8098 format. For now, the current solution is sufficient.
8099*/
/* The expansion is fully parenthesized so the macro behaves as a single
   operand wherever it is used (e.g. `sizeof FORMATBUFLEN`). */
#define FORMATBUFLEN ((size_t)120)
8101
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102PyObject *PyUnicode_Format(PyObject *format,
8103 PyObject *args)
8104{
8105 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008106 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107 int args_owned = 0;
8108 PyUnicodeObject *result = NULL;
8109 PyObject *dict = NULL;
8110 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008111
Guido van Rossumd57fd912000-03-10 22:53:23 +00008112 if (format == NULL || args == NULL) {
8113 PyErr_BadInternalCall();
8114 return NULL;
8115 }
8116 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008117 if (uformat == NULL)
8118 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008119 fmt = PyUnicode_AS_UNICODE(uformat);
8120 fmtcnt = PyUnicode_GET_SIZE(uformat);
8121
8122 reslen = rescnt = fmtcnt + 100;
8123 result = _PyUnicode_New(reslen);
8124 if (result == NULL)
8125 goto onError;
8126 res = PyUnicode_AS_UNICODE(result);
8127
8128 if (PyTuple_Check(args)) {
8129 arglen = PyTuple_Size(args);
8130 argidx = 0;
8131 }
8132 else {
8133 arglen = -1;
8134 argidx = -2;
8135 }
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008136 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008137 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008138 dict = args;
8139
8140 while (--fmtcnt >= 0) {
8141 if (*fmt != '%') {
8142 if (--rescnt < 0) {
8143 rescnt = fmtcnt + 100;
8144 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008145 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008146 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008147 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8148 --rescnt;
8149 }
8150 *res++ = *fmt++;
8151 }
8152 else {
8153 /* Got a format specifier */
8154 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008155 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008156 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008157 Py_UNICODE c = '\0';
8158 Py_UNICODE fill;
8159 PyObject *v = NULL;
8160 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008161 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008162 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008163 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008164 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008165
8166 fmt++;
8167 if (*fmt == '(') {
8168 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008169 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008170 PyObject *key;
8171 int pcount = 1;
8172
8173 if (dict == NULL) {
8174 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008175 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008176 goto onError;
8177 }
8178 ++fmt;
8179 --fmtcnt;
8180 keystart = fmt;
8181 /* Skip over balanced parentheses */
8182 while (pcount > 0 && --fmtcnt >= 0) {
8183 if (*fmt == ')')
8184 --pcount;
8185 else if (*fmt == '(')
8186 ++pcount;
8187 fmt++;
8188 }
8189 keylen = fmt - keystart - 1;
8190 if (fmtcnt < 0 || pcount > 0) {
8191 PyErr_SetString(PyExc_ValueError,
8192 "incomplete format key");
8193 goto onError;
8194 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008195#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008196 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008197 then looked up since Python uses strings to hold
8198 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008199 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008200 key = PyUnicode_EncodeUTF8(keystart,
8201 keylen,
8202 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008203#else
8204 key = PyUnicode_FromUnicode(keystart, keylen);
8205#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008206 if (key == NULL)
8207 goto onError;
8208 if (args_owned) {
8209 Py_DECREF(args);
8210 args_owned = 0;
8211 }
8212 args = PyObject_GetItem(dict, key);
8213 Py_DECREF(key);
8214 if (args == NULL) {
8215 goto onError;
8216 }
8217 args_owned = 1;
8218 arglen = -1;
8219 argidx = -2;
8220 }
8221 while (--fmtcnt >= 0) {
8222 switch (c = *fmt++) {
8223 case '-': flags |= F_LJUST; continue;
8224 case '+': flags |= F_SIGN; continue;
8225 case ' ': flags |= F_BLANK; continue;
8226 case '#': flags |= F_ALT; continue;
8227 case '0': flags |= F_ZERO; continue;
8228 }
8229 break;
8230 }
8231 if (c == '*') {
8232 v = getnextarg(args, arglen, &argidx);
8233 if (v == NULL)
8234 goto onError;
8235 if (!PyInt_Check(v)) {
8236 PyErr_SetString(PyExc_TypeError,
8237 "* wants int");
8238 goto onError;
8239 }
8240 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008241 if (width == -1 && PyErr_Occurred())
8242 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243 if (width < 0) {
8244 flags |= F_LJUST;
8245 width = -width;
8246 }
8247 if (--fmtcnt >= 0)
8248 c = *fmt++;
8249 }
8250 else if (c >= '0' && c <= '9') {
8251 width = c - '0';
8252 while (--fmtcnt >= 0) {
8253 c = *fmt++;
8254 if (c < '0' || c > '9')
8255 break;
8256 if ((width*10) / 10 != width) {
8257 PyErr_SetString(PyExc_ValueError,
8258 "width too big");
8259 goto onError;
8260 }
8261 width = width*10 + (c - '0');
8262 }
8263 }
8264 if (c == '.') {
8265 prec = 0;
8266 if (--fmtcnt >= 0)
8267 c = *fmt++;
8268 if (c == '*') {
8269 v = getnextarg(args, arglen, &argidx);
8270 if (v == NULL)
8271 goto onError;
8272 if (!PyInt_Check(v)) {
8273 PyErr_SetString(PyExc_TypeError,
8274 "* wants int");
8275 goto onError;
8276 }
8277 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008278 if (prec == -1 && PyErr_Occurred())
8279 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280 if (prec < 0)
8281 prec = 0;
8282 if (--fmtcnt >= 0)
8283 c = *fmt++;
8284 }
8285 else if (c >= '0' && c <= '9') {
8286 prec = c - '0';
8287 while (--fmtcnt >= 0) {
8288 c = Py_CHARMASK(*fmt++);
8289 if (c < '0' || c > '9')
8290 break;
8291 if ((prec*10) / 10 != prec) {
8292 PyErr_SetString(PyExc_ValueError,
8293 "prec too big");
8294 goto onError;
8295 }
8296 prec = prec*10 + (c - '0');
8297 }
8298 }
8299 } /* prec */
8300 if (fmtcnt >= 0) {
8301 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302 if (--fmtcnt >= 0)
8303 c = *fmt++;
8304 }
8305 }
8306 if (fmtcnt < 0) {
8307 PyErr_SetString(PyExc_ValueError,
8308 "incomplete format");
8309 goto onError;
8310 }
8311 if (c != '%') {
8312 v = getnextarg(args, arglen, &argidx);
8313 if (v == NULL)
8314 goto onError;
8315 }
8316 sign = 0;
8317 fill = ' ';
8318 switch (c) {
8319
8320 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008321 pbuf = formatbuf;
8322 /* presume that buffer length is at least 1 */
8323 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008324 len = 1;
8325 break;
8326
8327 case 's':
8328 case 'r':
8329 if (PyUnicode_Check(v) && c == 's') {
8330 temp = v;
8331 Py_INCREF(temp);
8332 }
8333 else {
8334 PyObject *unicode;
8335 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008336 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337 else
8338 temp = PyObject_Repr(v);
8339 if (temp == NULL)
8340 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008341 if (PyUnicode_Check(temp))
8342 /* nothing to do */;
8343 else if (PyString_Check(temp)) {
8344 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008345 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008346 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008347 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008348 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008349 Py_DECREF(temp);
8350 temp = unicode;
8351 if (temp == NULL)
8352 goto onError;
8353 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008354 else {
8355 Py_DECREF(temp);
8356 PyErr_SetString(PyExc_TypeError,
8357 "%s argument has non-string str()");
8358 goto onError;
8359 }
8360 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008361 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008362 len = PyUnicode_GET_SIZE(temp);
8363 if (prec >= 0 && len > prec)
8364 len = prec;
8365 break;
8366
8367 case 'i':
8368 case 'd':
8369 case 'u':
8370 case 'o':
8371 case 'x':
8372 case 'X':
8373 if (c == 'i')
8374 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008375 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008376 temp = formatlong(v, flags, prec, c);
8377 if (!temp)
8378 goto onError;
8379 pbuf = PyUnicode_AS_UNICODE(temp);
8380 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008381 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008382 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008383 else {
8384 pbuf = formatbuf;
8385 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8386 flags, prec, c, v);
8387 if (len < 0)
8388 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008389 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008390 }
8391 if (flags & F_ZERO)
8392 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008393 break;
8394
8395 case 'e':
8396 case 'E':
8397 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008398 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008399 case 'g':
8400 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008401 if (c == 'F')
8402 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008403 pbuf = formatbuf;
8404 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8405 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008406 if (len < 0)
8407 goto onError;
8408 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008409 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008410 fill = '0';
8411 break;
8412
8413 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008414 pbuf = formatbuf;
8415 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008416 if (len < 0)
8417 goto onError;
8418 break;
8419
8420 default:
8421 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008422 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008423 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008424 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008425 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008426 (Py_ssize_t)(fmt - 1 -
8427 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008428 goto onError;
8429 }
8430 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008431 if (*pbuf == '-' || *pbuf == '+') {
8432 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008433 len--;
8434 }
8435 else if (flags & F_SIGN)
8436 sign = '+';
8437 else if (flags & F_BLANK)
8438 sign = ' ';
8439 else
8440 sign = 0;
8441 }
8442 if (width < len)
8443 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008444 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008445 reslen -= rescnt;
8446 rescnt = width + fmtcnt + 100;
8447 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008448 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008449 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008450 PyErr_NoMemory();
8451 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008452 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008453 if (_PyUnicode_Resize(&result, reslen) < 0) {
8454 Py_XDECREF(temp);
8455 goto onError;
8456 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008457 res = PyUnicode_AS_UNICODE(result)
8458 + reslen - rescnt;
8459 }
8460 if (sign) {
8461 if (fill != ' ')
8462 *res++ = sign;
8463 rescnt--;
8464 if (width > len)
8465 width--;
8466 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008467 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008468 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008469 assert(pbuf[1] == c);
8470 if (fill != ' ') {
8471 *res++ = *pbuf++;
8472 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008473 }
Tim Petersfff53252001-04-12 18:38:48 +00008474 rescnt -= 2;
8475 width -= 2;
8476 if (width < 0)
8477 width = 0;
8478 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008479 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008480 if (width > len && !(flags & F_LJUST)) {
8481 do {
8482 --rescnt;
8483 *res++ = fill;
8484 } while (--width > len);
8485 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008486 if (fill == ' ') {
8487 if (sign)
8488 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008489 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008490 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008491 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008492 *res++ = *pbuf++;
8493 *res++ = *pbuf++;
8494 }
8495 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008496 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008497 res += len;
8498 rescnt -= len;
8499 while (--width >= len) {
8500 --rescnt;
8501 *res++ = ' ';
8502 }
8503 if (dict && (argidx < arglen) && c != '%') {
8504 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008505 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008506 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008507 goto onError;
8508 }
8509 Py_XDECREF(temp);
8510 } /* '%' */
8511 } /* until end */
8512 if (argidx < arglen && !dict) {
8513 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008514 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008515 goto onError;
8516 }
8517
Thomas Woutersa96affe2006-03-12 00:29:36 +00008518 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8519 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008520 if (args_owned) {
8521 Py_DECREF(args);
8522 }
8523 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008524 return (PyObject *)result;
8525
8526 onError:
8527 Py_XDECREF(result);
8528 Py_DECREF(uformat);
8529 if (args_owned) {
8530 Py_DECREF(args);
8531 }
8532 return NULL;
8533}
8534
8535static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008536 (readbufferproc) unicode_buffer_getreadbuf,
8537 (writebufferproc) unicode_buffer_getwritebuf,
8538 (segcountproc) unicode_buffer_getsegcount,
8539 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008540};
8541
Jeremy Hylton938ace62002-07-17 16:30:39 +00008542static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008543unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8544
Tim Peters6d6c1a32001-08-02 04:15:00 +00008545static PyObject *
8546unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8547{
8548 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00008549 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008550 char *encoding = NULL;
8551 char *errors = NULL;
8552
Guido van Rossume023fe02001-08-30 03:12:59 +00008553 if (type != &PyUnicode_Type)
8554 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008555 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8556 kwlist, &x, &encoding, &errors))
8557 return NULL;
8558 if (x == NULL)
8559 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008560 if (encoding == NULL && errors == NULL)
8561 return PyObject_Unicode(x);
8562 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008563 return PyUnicode_FromEncodedObject(x, encoding, errors);
8564}
8565
Guido van Rossume023fe02001-08-30 03:12:59 +00008566static PyObject *
8567unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8568{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008569 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008570 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008571
8572 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8573 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8574 if (tmp == NULL)
8575 return NULL;
8576 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008577 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008578 if (pnew == NULL) {
8579 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008580 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008581 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008582 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8583 if (pnew->str == NULL) {
8584 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008585 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008586 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008587 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008588 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008589 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8590 pnew->length = n;
8591 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008592 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008593 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008594}
8595
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008596PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008597"unicode(string [, encoding[, errors]]) -> object\n\
8598\n\
8599Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008600encoding defaults to the current default string encoding.\n\
8601errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008602
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008603static PyObject *unicode_iter(PyObject *seq);
8604
Guido van Rossumd57fd912000-03-10 22:53:23 +00008605PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008606 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008607 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608 sizeof(PyUnicodeObject), /* tp_size */
8609 0, /* tp_itemsize */
8610 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008611 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008612 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008613 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008615 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008616 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008617 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008619 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620 (hashfunc) unicode_hash, /* tp_hash*/
8621 0, /* tp_call*/
8622 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008623 PyObject_GenericGetAttr, /* tp_getattro */
8624 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008626 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8627 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008628 unicode_doc, /* tp_doc */
8629 0, /* tp_traverse */
8630 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008631 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008632 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008633 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008634 0, /* tp_iternext */
8635 unicode_methods, /* tp_methods */
8636 0, /* tp_members */
8637 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008638 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008639 0, /* tp_dict */
8640 0, /* tp_descr_get */
8641 0, /* tp_descr_set */
8642 0, /* tp_dictoffset */
8643 0, /* tp_init */
8644 0, /* tp_alloc */
8645 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008646 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008647};
8648
8649/* Initialize the Unicode implementation */
8650
Thomas Wouters78890102000-07-22 19:25:51 +00008651void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008652{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008653 int i;
8654
Thomas Wouters477c8d52006-05-27 19:21:47 +00008655 /* XXX - move this array to unicodectype.c ? */
8656 Py_UNICODE linebreak[] = {
8657 0x000A, /* LINE FEED */
8658 0x000D, /* CARRIAGE RETURN */
8659 0x001C, /* FILE SEPARATOR */
8660 0x001D, /* GROUP SEPARATOR */
8661 0x001E, /* RECORD SEPARATOR */
8662 0x0085, /* NEXT LINE */
8663 0x2028, /* LINE SEPARATOR */
8664 0x2029, /* PARAGRAPH SEPARATOR */
8665 };
8666
Fred Drakee4315f52000-05-09 19:53:39 +00008667 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008668 unicode_freelist = NULL;
8669 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008670 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008671 if (!unicode_empty)
8672 return;
8673
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008674 for (i = 0; i < 256; i++)
8675 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008676 if (PyType_Ready(&PyUnicode_Type) < 0)
8677 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008678
8679 /* initialize the linebreak bloom filter */
8680 bloom_linebreak = make_bloom_mask(
8681 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8682 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008683
8684 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685}
8686
8687/* Finalize the Unicode implementation */
8688
8689void
Thomas Wouters78890102000-07-22 19:25:51 +00008690_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008691{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008692 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008693 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008694
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008695 Py_XDECREF(unicode_empty);
8696 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008697
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008698 for (i = 0; i < 256; i++) {
8699 if (unicode_latin1[i]) {
8700 Py_DECREF(unicode_latin1[i]);
8701 unicode_latin1[i] = NULL;
8702 }
8703 }
8704
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008705 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008706 PyUnicodeObject *v = u;
8707 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008708 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008709 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008710 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008711 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008712 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008713 unicode_freelist = NULL;
8714 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008715}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008716
Walter Dörwald16807132007-05-25 13:52:07 +00008717void
8718PyUnicode_InternInPlace(PyObject **p)
8719{
8720 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
8721 PyObject *t;
8722 if (s == NULL || !PyUnicode_Check(s))
8723 Py_FatalError(
8724 "PyUnicode_InternInPlace: unicode strings only please!");
8725 /* If it's a subclass, we don't really know what putting
8726 it in the interned dict might do. */
8727 if (!PyUnicode_CheckExact(s))
8728 return;
8729 if (PyUnicode_CHECK_INTERNED(s))
8730 return;
8731 if (interned == NULL) {
8732 interned = PyDict_New();
8733 if (interned == NULL) {
8734 PyErr_Clear(); /* Don't leave an exception */
8735 return;
8736 }
8737 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00008738 /* It might be that the GetItem call fails even
8739 though the key is present in the dictionary,
8740 namely when this happens during a stack overflow. */
8741 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00008742 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00008743 Py_END_ALLOW_RECURSION
8744
Walter Dörwald16807132007-05-25 13:52:07 +00008745 if (t) {
8746 Py_INCREF(t);
8747 Py_DECREF(*p);
8748 *p = t;
8749 return;
8750 }
8751
Martin v. Löwis5b222132007-06-10 09:51:05 +00008752 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00008753 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
8754 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00008755 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00008756 return;
8757 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00008758 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00008759 /* The two references in interned are not counted by refcnt.
8760 The deallocator will take care of this */
8761 s->ob_refcnt -= 2;
8762 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
8763}
8764
8765void
8766PyUnicode_InternImmortal(PyObject **p)
8767{
8768 PyUnicode_InternInPlace(p);
8769 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
8770 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
8771 Py_INCREF(*p);
8772 }
8773}
8774
8775PyObject *
8776PyUnicode_InternFromString(const char *cp)
8777{
8778 PyObject *s = PyUnicode_FromString(cp);
8779 if (s == NULL)
8780 return NULL;
8781 PyUnicode_InternInPlace(&s);
8782 return s;
8783}
8784
8785void _Py_ReleaseInternedUnicodeStrings(void)
8786{
8787 PyObject *keys;
8788 PyUnicodeObject *s;
8789 Py_ssize_t i, n;
8790 Py_ssize_t immortal_size = 0, mortal_size = 0;
8791
8792 if (interned == NULL || !PyDict_Check(interned))
8793 return;
8794 keys = PyDict_Keys(interned);
8795 if (keys == NULL || !PyList_Check(keys)) {
8796 PyErr_Clear();
8797 return;
8798 }
8799
8800 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
8801 detector, interned unicode strings are not forcibly deallocated;
8802 rather, we give them their stolen references back, and then clear
8803 and DECREF the interned dict. */
8804
8805 n = PyList_GET_SIZE(keys);
8806 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
8807 n);
8808 for (i = 0; i < n; i++) {
8809 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
8810 switch (s->state) {
8811 case SSTATE_NOT_INTERNED:
8812 /* XXX Shouldn't happen */
8813 break;
8814 case SSTATE_INTERNED_IMMORTAL:
8815 s->ob_refcnt += 1;
8816 immortal_size += s->length;
8817 break;
8818 case SSTATE_INTERNED_MORTAL:
8819 s->ob_refcnt += 2;
8820 mortal_size += s->length;
8821 break;
8822 default:
8823 Py_FatalError("Inconsistent interned string state.");
8824 }
8825 s->state = SSTATE_NOT_INTERNED;
8826 }
8827 fprintf(stderr, "total size of all interned strings: "
8828 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
8829 "mortal/immortal\n", mortal_size, immortal_size);
8830 Py_DECREF(keys);
8831 PyDict_Clear(interned);
8832 Py_DECREF(interned);
8833 interned = NULL;
8834}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008835
8836
8837/********************* Unicode Iterator **************************/
8838
8839typedef struct {
8840 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00008841 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008842 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
8843} unicodeiterobject;
8844
8845static void
8846unicodeiter_dealloc(unicodeiterobject *it)
8847{
8848 _PyObject_GC_UNTRACK(it);
8849 Py_XDECREF(it->it_seq);
8850 PyObject_GC_Del(it);
8851}
8852
8853static int
8854unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8855{
8856 Py_VISIT(it->it_seq);
8857 return 0;
8858}
8859
8860static PyObject *
8861unicodeiter_next(unicodeiterobject *it)
8862{
8863 PyUnicodeObject *seq;
8864 PyObject *item;
8865
8866 assert(it != NULL);
8867 seq = it->it_seq;
8868 if (seq == NULL)
8869 return NULL;
8870 assert(PyUnicode_Check(seq));
8871
8872 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008873 item = PyUnicode_FromUnicode(
8874 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008875 if (item != NULL)
8876 ++it->it_index;
8877 return item;
8878 }
8879
8880 Py_DECREF(seq);
8881 it->it_seq = NULL;
8882 return NULL;
8883}
8884
8885static PyObject *
8886unicodeiter_len(unicodeiterobject *it)
8887{
8888 Py_ssize_t len = 0;
8889 if (it->it_seq)
8890 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8891 return PyInt_FromSsize_t(len);
8892}
8893
8894PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8895
8896static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008897 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8898 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008899 {NULL, NULL} /* sentinel */
8900};
8901
8902PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008903 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008904 "unicodeiterator", /* tp_name */
8905 sizeof(unicodeiterobject), /* tp_basicsize */
8906 0, /* tp_itemsize */
8907 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00008908 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008909 0, /* tp_print */
8910 0, /* tp_getattr */
8911 0, /* tp_setattr */
8912 0, /* tp_compare */
8913 0, /* tp_repr */
8914 0, /* tp_as_number */
8915 0, /* tp_as_sequence */
8916 0, /* tp_as_mapping */
8917 0, /* tp_hash */
8918 0, /* tp_call */
8919 0, /* tp_str */
8920 PyObject_GenericGetAttr, /* tp_getattro */
8921 0, /* tp_setattro */
8922 0, /* tp_as_buffer */
8923 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8924 0, /* tp_doc */
8925 (traverseproc)unicodeiter_traverse, /* tp_traverse */
8926 0, /* tp_clear */
8927 0, /* tp_richcompare */
8928 0, /* tp_weaklistoffset */
8929 PyObject_SelfIter, /* tp_iter */
8930 (iternextfunc)unicodeiter_next, /* tp_iternext */
8931 unicodeiter_methods, /* tp_methods */
8932 0,
8933};
8934
8935static PyObject *
8936unicode_iter(PyObject *seq)
8937{
8938 unicodeiterobject *it;
8939
8940 if (!PyUnicode_Check(seq)) {
8941 PyErr_BadInternalCall();
8942 return NULL;
8943 }
8944 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8945 if (it == NULL)
8946 return NULL;
8947 it->it_index = 0;
8948 Py_INCREF(seq);
8949 it->it_seq = (PyUnicodeObject *)seq;
8950 _PyObject_GC_TRACK(it);
8951 return (PyObject *)it;
8952}
8953
Martin v. Löwis5b222132007-06-10 09:51:05 +00008954size_t
8955Py_UNICODE_strlen(const Py_UNICODE *u)
8956{
8957 int res = 0;
8958 while(*u++)
8959 res++;
8960 return res;
8961}
8962
8963Py_UNICODE*
8964Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
8965{
8966 Py_UNICODE *u = s1;
8967 while ((*u++ = *s2++));
8968 return s1;
8969}
8970
8971Py_UNICODE*
8972Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
8973{
8974 Py_UNICODE *u = s1;
8975 while ((*u++ = *s2++))
8976 if (n-- == 0)
8977 break;
8978 return s1;
8979}
8980
8981int
8982Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
8983{
8984 while (*s1 && *s2 && *s1 == *s2)
8985 s1++, s2++;
8986 if (*s1 && *s2)
8987 return (*s1 < *s2) ? -1 : +1;
8988 if (*s1)
8989 return 1;
8990 if (*s2)
8991 return -1;
8992 return 0;
8993}
8994
8995Py_UNICODE*
8996Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
8997{
8998 const Py_UNICODE *p;
8999 for (p = s; *p; p++)
9000 if (*p == c)
9001 return (Py_UNICODE*)p;
9002 return NULL;
9003}
9004
9005
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009006#ifdef __cplusplus
9007}
9008#endif
9009
9010
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009011/*
9012Local variables:
9013c-basic-offset: 4
9014indent-tabs-mode: nil
9015End:
9016*/