blob: 6b27adb73c594d201f70367527950cdf938a1e4f [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Walter Dörwald16807132007-05-25 13:52:07 +000095/* This dictionary holds all interned unicode strings. Note that references
96 to strings in this dictionary are *not* counted in the string's ob_refcnt.
97 When the interned string reaches a refcnt of 0 the string deallocation
98 function will delete the reference from this dictionary.
99
   Another way to look at this is to say that the actual reference
   count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
102*/
103static PyObject *interned;
104
Guido van Rossumd57fd912000-03-10 22:53:23 +0000105/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000106static PyUnicodeObject *unicode_freelist;
107static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000109/* The empty Unicode object is shared to improve performance. */
110static PyUnicodeObject *unicode_empty;
111
112/* Single character Unicode strings in the Latin-1 range are being
113 shared as well. */
114static PyUnicodeObject *unicode_latin1[256];
115
Fred Drakee4315f52000-05-09 19:53:39 +0000116/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000117 parameter; it is fixed to "utf-8". Always use the
118 PyUnicode_GetDefaultEncoding() API to access this global. */
119static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000120
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000121Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000122PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000123{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000124#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000125 return 0x10FFFF;
126#else
127 /* This is actually an illegal character, so it should
128 not be passed to unichr. */
129 return 0xFFFF;
130#endif
131}
132
Thomas Wouters477c8d52006-05-27 19:21:47 +0000133/* --- Bloom Filters ----------------------------------------------------- */
134
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the 5
   least-significant bits of each Unicode character as the bit index. */
138
139/* the linebreak mask is set up by Unicode_Init below */
140
/* Integer type holding the single-word bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK() below. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is unsigned long on purpose: shifting a plain
   int 1 left by 31 overflows a 32-bit int, and the resulting negative
   value would sign-extend into the upper bits of a 64-bit mask. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line break test: the cheap bloom check runs first, and the
   Py_UNICODE_ISLINEBREAK() check only runs when the bloom bit is set. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
149
150Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
151{
152 /* calculate simple bloom-style bitmask for a given unicode string */
153
154 long mask;
155 Py_ssize_t i;
156
157 mask = 0;
158 for (i = 0; i < len; i++)
159 mask |= (1 << (ptr[i] & 0x1F));
160
161 return mask;
162}
163
164Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
165{
166 Py_ssize_t i;
167
168 for (i = 0; i < setlen; i++)
169 if (set[i] == chr)
170 return 1;
171
172 return 0;
173}
174
/* Non-zero if chr is found in set[0..setlen).  The bloom check on mask
   runs first, so unicode_member() is only called when the bloom bit for
   chr is set (mask is presumably the bloom mask computed for set).
   The expansion is fully parenthesized so it behaves as a single operand
   when combined with !, || or other operators at the call site. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
177
Guido van Rossumd57fd912000-03-10 22:53:23 +0000178/* --- Unicode Object ----------------------------------------------------- */
179
180static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000182 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183{
184 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000185
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000186 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000187 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000188 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190 /* Resizing shared object (unicode_empty or single character
191 objects) in-place is not allowed. Use PyUnicode_Resize()
192 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000194 if (unicode == unicode_empty ||
195 (unicode->length == 1 &&
196 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 return -1;
201 }
202
Thomas Wouters477c8d52006-05-27 19:21:47 +0000203 /* We allocate one more byte to make sure the string is Ux0000 terminated.
204 The overallocation is also used by fastsearch, which assumes that it's
205 safe to look at str[length] (without making any assumptions about what
206 it contains). */
207
Guido van Rossumd57fd912000-03-10 22:53:23 +0000208 oldstr = unicode->str;
209 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
210 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000211 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212 PyErr_NoMemory();
213 return -1;
214 }
215 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000216 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000218 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000220 if (unicode->defenc) {
221 Py_DECREF(unicode->defenc);
222 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000223 }
224 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000225
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 return 0;
227}
228
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231
232 XXX This allocator could further be enhanced by assuring that the
233 free list never reduces its size below 1.
234
235*/
236
237static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000238PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000239{
240 register PyUnicodeObject *unicode;
241
Thomas Wouters477c8d52006-05-27 19:21:47 +0000242 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (length == 0 && unicode_empty != NULL) {
244 Py_INCREF(unicode_empty);
245 return unicode_empty;
246 }
247
248 /* Unicode freelist & memory allocation */
249 if (unicode_freelist) {
250 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000251 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000254 /* Keep-Alive optimization: we only upsize the buffer,
255 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000256 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000257 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000258 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000259 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000262 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000264 }
265 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 }
267 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000268 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 if (unicode == NULL)
270 return NULL;
271 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
272 }
273
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000274 if (!unicode->str) {
275 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000276 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000277 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000278 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000279 * the caller fails before initializing str -- unicode_resize()
280 * reads str[0], and the Keep-Alive optimization can keep memory
281 * allocated for str alive across a call to unicode_dealloc(unicode).
282 * We don't want unicode_resize to read uninitialized memory in
283 * that case.
284 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000285 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000287 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000289 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000290 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000292
293 onError:
294 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000295 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000296 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297}
298
299static
Guido van Rossum9475a232001-10-05 20:51:39 +0000300void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301{
Walter Dörwald16807132007-05-25 13:52:07 +0000302 switch (PyUnicode_CHECK_INTERNED(unicode)) {
303 case SSTATE_NOT_INTERNED:
304 break;
305
306 case SSTATE_INTERNED_MORTAL:
307 /* revive dead object temporarily for DelItem */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +0000308 Py_Refcnt(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000309 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
310 Py_FatalError(
311 "deletion of interned unicode string failed");
312 break;
313
314 case SSTATE_INTERNED_IMMORTAL:
315 Py_FatalError("Immortal interned unicode string died.");
316
317 default:
318 Py_FatalError("Inconsistent interned unicode string state.");
319 }
320
Guido van Rossum604ddf82001-12-06 20:03:56 +0000321 if (PyUnicode_CheckExact(unicode) &&
322 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000323 /* Keep-Alive optimization */
324 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000325 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 unicode->str = NULL;
327 unicode->length = 0;
328 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000329 if (unicode->defenc) {
330 Py_DECREF(unicode->defenc);
331 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000332 }
333 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 *(PyUnicodeObject **)unicode = unicode_freelist;
335 unicode_freelist = unicode;
336 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 }
338 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000339 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000340 Py_XDECREF(unicode->defenc);
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000341 Py_Type(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 }
343}
344
Martin v. Löwis18e16552006-02-15 17:27:45 +0000345int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000346{
347 register PyUnicodeObject *v;
348
349 /* Argument checks */
350 if (unicode == NULL) {
351 PyErr_BadInternalCall();
352 return -1;
353 }
354 v = (PyUnicodeObject *)*unicode;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000355 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000356 PyErr_BadInternalCall();
357 return -1;
358 }
359
360 /* Resizing unicode_empty and single character objects is not
361 possible since these are being shared. We simply return a fresh
362 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000363 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000364 (v == unicode_empty || v->length == 1)) {
365 PyUnicodeObject *w = _PyUnicode_New(length);
366 if (w == NULL)
367 return -1;
368 Py_UNICODE_COPY(w->str, v->str,
369 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000370 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000371 *unicode = (PyObject *)w;
372 return 0;
373 }
374
375 /* Note that we don't have to modify *unicode for unshared Unicode
376 objects, since we can modify them in-place. */
377 return unicode_resize(v, length);
378}
379
/* Internal API for use in unicodeobject.c only !
   Convenience wrapper around PyUnicode_Resize() that casts the address
   of a Unicode object variable to PyObject **.  Both arguments are
   parenthesized so that arbitrary expressions can be passed safely. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
383
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000385 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386{
387 PyUnicodeObject *unicode;
388
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000389 /* If the Unicode data is known at construction time, we can apply
390 some optimizations which share commonly used objects. */
391 if (u != NULL) {
392
393 /* Optimization for empty strings */
394 if (size == 0 && unicode_empty != NULL) {
395 Py_INCREF(unicode_empty);
396 return (PyObject *)unicode_empty;
397 }
398
399 /* Single character Unicode objects in the Latin-1 range are
400 shared when using this constructor */
401 if (size == 1 && *u < 256) {
402 unicode = unicode_latin1[*u];
403 if (!unicode) {
404 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000405 if (!unicode)
406 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000407 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000408 unicode_latin1[*u] = unicode;
409 }
410 Py_INCREF(unicode);
411 return (PyObject *)unicode;
412 }
413 }
Tim Petersced69f82003-09-16 20:30:58 +0000414
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 unicode = _PyUnicode_New(size);
416 if (!unicode)
417 return NULL;
418
419 /* Copy the Unicode data into the new object */
420 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000421 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000422
423 return (PyObject *)unicode;
424}
425
Walter Dörwaldd2034312007-05-18 16:29:38 +0000426PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000427{
428 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000429 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000430 some optimizations which share commonly used objects.
431 Also, this means the input must be UTF-8, so fall back to the
432 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000433 if (u != NULL) {
434
435 /* Optimization for empty strings */
436 if (size == 0 && unicode_empty != NULL) {
437 Py_INCREF(unicode_empty);
438 return (PyObject *)unicode_empty;
439 }
440
Martin v. Löwis9c121062007-08-05 20:26:11 +0000441 /* Single characters are shared when using this constructor.
442 Restrict to ASCII, since the input must be UTF-8. */
443 if (size == 1 && Py_CHARMASK(*u) < 128) {
Guido van Rossum00058aa2007-07-19 18:21:28 +0000444 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000445 if (!unicode) {
446 unicode = _PyUnicode_New(1);
447 if (!unicode)
448 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000449 unicode->str[0] = Py_CHARMASK(*u);
450 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000451 }
452 Py_INCREF(unicode);
453 return (PyObject *)unicode;
454 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000455
456 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000457 }
458
Walter Dörwald55507312007-05-18 13:12:10 +0000459 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000460 if (!unicode)
461 return NULL;
462
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000463 return (PyObject *)unicode;
464}
465
Walter Dörwaldd2034312007-05-18 16:29:38 +0000466PyObject *PyUnicode_FromString(const char *u)
467{
468 size_t size = strlen(u);
469 if (size > PY_SSIZE_T_MAX) {
470 PyErr_SetString(PyExc_OverflowError, "input too long");
471 return NULL;
472 }
473
474 return PyUnicode_FromStringAndSize(u, size);
475}
476
Guido van Rossumd57fd912000-03-10 22:53:23 +0000477#ifdef HAVE_WCHAR_H
478
479PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000480 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000481{
482 PyUnicodeObject *unicode;
483
484 if (w == NULL) {
485 PyErr_BadInternalCall();
486 return NULL;
487 }
488
489 unicode = _PyUnicode_New(size);
490 if (!unicode)
491 return NULL;
492
493 /* Copy the wchar_t data into the new object */
494#ifdef HAVE_USABLE_WCHAR_T
495 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000496#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000497 {
498 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000499 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000500 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000501 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000502 *u++ = *w++;
503 }
504#endif
505
506 return (PyObject *)unicode;
507}
508
Walter Dörwald346737f2007-05-31 10:44:43 +0000509static void
510makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
511{
512 *fmt++ = '%';
513 if (width) {
514 if (zeropad)
515 *fmt++ = '0';
516 fmt += sprintf(fmt, "%d", width);
517 }
518 if (precision)
519 fmt += sprintf(fmt, ".%d", precision);
520 if (longflag)
521 *fmt++ = 'l';
522 else if (size_tflag) {
523 char *f = PY_FORMAT_SIZE_T;
524 while (*f)
525 *fmt++ = *f++;
526 }
527 *fmt++ = c;
528 *fmt = '\0';
529}
530
/* Append the NUL-terminated char string `string` to the output position
   `s`, one element per char, advancing `s` past the copied data.  Uses
   the variables `copy` and `s` of the enclosing scope.  Wrapped in
   do { } while (0) so that `appendstring(x);` is a single statement and
   stays correct inside an unbraced if/else. */
#define appendstring(string) \
    do { for (copy = (string); *copy;) *s++ = *copy++; } while (0)
532
533PyObject *
534PyUnicode_FromFormatV(const char *format, va_list vargs)
535{
536 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000537 Py_ssize_t callcount = 0;
538 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000539 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000540 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000541 int width = 0;
542 int precision = 0;
543 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000544 const char* f;
545 Py_UNICODE *s;
546 PyObject *string;
547 /* used by sprintf */
548 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000549 /* use abuffer instead of buffer, if we need more space
550 * (which can happen if there's a format specifier with width). */
551 char *abuffer = NULL;
552 char *realbuffer;
553 Py_ssize_t abuffersize = 0;
554 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000555 const char *copy;
556
557#ifdef VA_LIST_IS_ARRAY
558 Py_MEMCPY(count, vargs, sizeof(va_list));
559#else
560#ifdef __va_copy
561 __va_copy(count, vargs);
562#else
563 count = vargs;
564#endif
565#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000566 /* step 1: count the number of %S/%R format specifications
567 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
568 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000569 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000570 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000571 ++callcount;
572 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000573 /* step 2: allocate memory for the results of
574 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000575 if (callcount) {
576 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
577 if (!callresults) {
578 PyErr_NoMemory();
579 return NULL;
580 }
581 callresult = callresults;
582 }
583 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000584 for (f = format; *f; f++) {
585 if (*f == '%') {
586 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000587 width = 0;
588 while (isdigit(Py_CHARMASK(*f)))
589 width = (width*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000590 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
591 ;
592
593 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
594 * they don't affect the amount of space we reserve.
595 */
596 if ((*f == 'l' || *f == 'z') &&
597 (f[1] == 'd' || f[1] == 'u'))
598 ++f;
599
600 switch (*f) {
601 case 'c':
602 (void)va_arg(count, int);
603 /* fall through... */
604 case '%':
605 n++;
606 break;
607 case 'd': case 'u': case 'i': case 'x':
608 (void) va_arg(count, int);
609 /* 20 bytes is enough to hold a 64-bit
610 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000611 This isn't enough for octal.
612 If a width is specified we need more
613 (which we allocate later). */
614 if (width < 20)
615 width = 20;
616 n += width;
617 if (abuffersize < width)
618 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000619 break;
620 case 's':
621 n += strlen(va_arg(count, char*));
622 break;
623 case 'U':
624 {
625 PyObject *obj = va_arg(count, PyObject *);
626 assert(obj && PyUnicode_Check(obj));
627 n += PyUnicode_GET_SIZE(obj);
628 break;
629 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000630 case 'V':
631 {
632 PyObject *obj = va_arg(count, PyObject *);
633 const char *str = va_arg(count, const char *);
634 assert(obj || str);
635 assert(!obj || PyUnicode_Check(obj));
636 if (obj)
637 n += PyUnicode_GET_SIZE(obj);
638 else
639 n += strlen(str);
640 break;
641 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000642 case 'S':
643 {
644 PyObject *obj = va_arg(count, PyObject *);
645 PyObject *str;
646 assert(obj);
647 str = PyObject_Unicode(obj);
648 if (!str)
649 goto fail;
650 n += PyUnicode_GET_SIZE(str);
651 /* Remember the str and switch to the next slot */
652 *callresult++ = str;
653 break;
654 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000655 case 'R':
656 {
657 PyObject *obj = va_arg(count, PyObject *);
658 PyObject *repr;
659 assert(obj);
660 repr = PyObject_Repr(obj);
661 if (!repr)
662 goto fail;
663 n += PyUnicode_GET_SIZE(repr);
664 /* Remember the repr and switch to the next slot */
665 *callresult++ = repr;
666 break;
667 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000668 case 'p':
669 (void) va_arg(count, int);
670 /* maximum 64-bit pointer representation:
671 * 0xffffffffffffffff
672 * so 19 characters is enough.
673 * XXX I count 18 -- what's the extra for?
674 */
675 n += 19;
676 break;
677 default:
678 /* if we stumble upon an unknown
679 formatting code, copy the rest of
680 the format string to the output
681 string. (we cannot just skip the
682 code, since there's no way to know
683 what's in the argument list) */
684 n += strlen(p);
685 goto expand;
686 }
687 } else
688 n++;
689 }
690 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000691 if (abuffersize > 20) {
692 abuffer = PyMem_Malloc(abuffersize);
693 if (!abuffer) {
694 PyErr_NoMemory();
695 goto fail;
696 }
697 realbuffer = abuffer;
698 }
699 else
700 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000701 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000702 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000703 we don't have to resize the string.
704 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000705 string = PyUnicode_FromUnicode(NULL, n);
706 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000707 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000708
709 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000710 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000711
712 for (f = format; *f; f++) {
713 if (*f == '%') {
714 const char* p = f++;
715 int longflag = 0;
716 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000717 zeropad = (*f == '0');
718 /* parse the width.precision part */
719 width = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000720 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000721 width = (width*10) + *f++ - '0';
722 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000723 if (*f == '.') {
724 f++;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000725 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000726 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000727 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000728 /* handle the long flag, but only for %ld and %lu.
729 others can be added when necessary. */
730 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
731 longflag = 1;
732 ++f;
733 }
734 /* handle the size_t flag. */
735 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
736 size_tflag = 1;
737 ++f;
738 }
739
740 switch (*f) {
741 case 'c':
742 *s++ = va_arg(vargs, int);
743 break;
744 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000745 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000746 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000747 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000748 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000749 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000750 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000751 sprintf(realbuffer, fmt, va_arg(vargs, int));
752 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000753 break;
754 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000755 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000756 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000757 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000758 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000759 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000760 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000761 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
762 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000763 break;
764 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000765 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
766 sprintf(realbuffer, fmt, va_arg(vargs, int));
767 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000768 break;
769 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000770 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
771 sprintf(realbuffer, fmt, va_arg(vargs, int));
772 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000773 break;
774 case 's':
775 p = va_arg(vargs, char*);
776 appendstring(p);
777 break;
778 case 'U':
779 {
780 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000781 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
782 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
783 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000784 break;
785 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000786 case 'V':
787 {
788 PyObject *obj = va_arg(vargs, PyObject *);
789 const char *str = va_arg(vargs, const char *);
790 if (obj) {
791 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
792 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
793 s += size;
794 } else {
795 appendstring(str);
796 }
797 break;
798 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000799 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000800 case 'R':
801 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000802 Py_UNICODE *ucopy;
803 Py_ssize_t usize;
804 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000805 /* unused, since we already have the result */
806 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000807 ucopy = PyUnicode_AS_UNICODE(*callresult);
808 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000809 for (upos = 0; upos<usize;)
810 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000811 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000812 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000813 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000814 ++callresult;
815 break;
816 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000817 case 'p':
818 sprintf(buffer, "%p", va_arg(vargs, void*));
819 /* %p is ill-defined: ensure leading 0x. */
820 if (buffer[1] == 'X')
821 buffer[1] = 'x';
822 else if (buffer[1] != 'x') {
823 memmove(buffer+2, buffer, strlen(buffer)+1);
824 buffer[0] = '0';
825 buffer[1] = 'x';
826 }
827 appendstring(buffer);
828 break;
829 case '%':
830 *s++ = '%';
831 break;
832 default:
833 appendstring(p);
834 goto end;
835 }
836 } else
837 *s++ = *f;
838 }
839
840 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000841 if (callresults)
842 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000843 if (abuffer)
844 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000845 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
846 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000847 fail:
848 if (callresults) {
849 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000850 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000851 Py_DECREF(*callresult2);
852 ++callresult2;
853 }
854 PyMem_Free(callresults);
855 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000856 if (abuffer)
857 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000858 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000859}
860
861#undef appendstring
862
863PyObject *
864PyUnicode_FromFormat(const char *format, ...)
865{
866 PyObject* ret;
867 va_list vargs;
868
869#ifdef HAVE_STDARG_PROTOTYPES
870 va_start(vargs, format);
871#else
872 va_start(vargs);
873#endif
874 ret = PyUnicode_FromFormatV(format, vargs);
875 va_end(vargs);
876 return ret;
877}
878
Martin v. Löwis18e16552006-02-15 17:27:45 +0000879Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
880 wchar_t *w,
881 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000882{
883 if (unicode == NULL) {
884 PyErr_BadInternalCall();
885 return -1;
886 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000887
888 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000889 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000890 size = PyUnicode_GET_SIZE(unicode) + 1;
891
Guido van Rossumd57fd912000-03-10 22:53:23 +0000892#ifdef HAVE_USABLE_WCHAR_T
893 memcpy(w, unicode->str, size * sizeof(wchar_t));
894#else
895 {
896 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000897 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000898 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000899 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000900 *w++ = *u++;
901 }
902#endif
903
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000904 if (size > PyUnicode_GET_SIZE(unicode))
905 return PyUnicode_GET_SIZE(unicode);
906 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000907 return size;
908}
909
910#endif
911
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000912PyObject *PyUnicode_FromOrdinal(int ordinal)
913{
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000914 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000915
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000916 if (ordinal < 0 || ordinal > 0x10ffff) {
917 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000918 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000919 return NULL;
920 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000921
922#ifndef Py_UNICODE_WIDE
923 if (ordinal > 0xffff) {
924 ordinal -= 0x10000;
925 s[0] = 0xD800 | (ordinal >> 10);
926 s[1] = 0xDC00 | (ordinal & 0x3FF);
927 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000928 }
929#endif
930
Hye-Shik Chang40574832004-04-06 07:24:51 +0000931 s[0] = (Py_UNICODE)ordinal;
932 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000933}
934
Guido van Rossumd57fd912000-03-10 22:53:23 +0000935PyObject *PyUnicode_FromObject(register PyObject *obj)
936{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000937 /* XXX Perhaps we should make this API an alias of
938 PyObject_Unicode() instead ?! */
939 if (PyUnicode_CheckExact(obj)) {
940 Py_INCREF(obj);
941 return obj;
942 }
943 if (PyUnicode_Check(obj)) {
944 /* For a Unicode subtype that's not a Unicode object,
945 return a true Unicode object with the same data. */
946 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
947 PyUnicode_GET_SIZE(obj));
948 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000949 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
950}
951
952PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
953 const char *encoding,
954 const char *errors)
955{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000956 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000957 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000958 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000959
Guido van Rossumd57fd912000-03-10 22:53:23 +0000960 if (obj == NULL) {
961 PyErr_BadInternalCall();
962 return NULL;
963 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000964
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000965#if 0
966 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000967 that no encodings is given and then redirect to
968 PyObject_Unicode() which then applies the additional logic for
969 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000970
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000971 NOTE: This API should really only be used for object which
972 represent *encoded* Unicode !
973
974 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000975 if (PyUnicode_Check(obj)) {
976 if (encoding) {
977 PyErr_SetString(PyExc_TypeError,
978 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000979 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000980 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000981 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000982 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000983#else
984 if (PyUnicode_Check(obj)) {
985 PyErr_SetString(PyExc_TypeError,
986 "decoding Unicode is not supported");
987 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000988 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000989#endif
990
991 /* Coerce object */
992 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000993 s = PyString_AS_STRING(obj);
994 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000995 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000996 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
997 /* Overwrite the error message with something more useful in
998 case of a TypeError. */
999 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001000 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001001 "coercing to Unicode: need string or buffer, "
1002 "%.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001003 Py_Type(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001004 goto onError;
1005 }
Tim Petersced69f82003-09-16 20:30:58 +00001006
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001007 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001008 if (len == 0) {
1009 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001010 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001011 }
Tim Petersced69f82003-09-16 20:30:58 +00001012 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001013 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001014
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001015 return v;
1016
1017 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001018 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001019}
1020
1021PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001022 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001023 const char *encoding,
1024 const char *errors)
1025{
1026 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001027
1028 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001029 encoding = PyUnicode_GetDefaultEncoding();
1030
1031 /* Shortcuts for common default encodings */
1032 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001033 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001034 else if (strcmp(encoding, "latin-1") == 0)
1035 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001036#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1037 else if (strcmp(encoding, "mbcs") == 0)
1038 return PyUnicode_DecodeMBCS(s, size, errors);
1039#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001040 else if (strcmp(encoding, "ascii") == 0)
1041 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042
1043 /* Decode via the codec registry */
1044 buffer = PyBuffer_FromMemory((void *)s, size);
1045 if (buffer == NULL)
1046 goto onError;
1047 unicode = PyCodec_Decode(buffer, encoding, errors);
1048 if (unicode == NULL)
1049 goto onError;
1050 if (!PyUnicode_Check(unicode)) {
1051 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001052 "decoder did not return an unicode object (type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001053 Py_Type(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001054 Py_DECREF(unicode);
1055 goto onError;
1056 }
1057 Py_DECREF(buffer);
1058 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001059
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060 onError:
1061 Py_XDECREF(buffer);
1062 return NULL;
1063}
1064
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001065PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1066 const char *encoding,
1067 const char *errors)
1068{
1069 PyObject *v;
1070
1071 if (!PyUnicode_Check(unicode)) {
1072 PyErr_BadArgument();
1073 goto onError;
1074 }
1075
1076 if (encoding == NULL)
1077 encoding = PyUnicode_GetDefaultEncoding();
1078
1079 /* Decode via the codec registry */
1080 v = PyCodec_Decode(unicode, encoding, errors);
1081 if (v == NULL)
1082 goto onError;
1083 return v;
1084
1085 onError:
1086 return NULL;
1087}
1088
Guido van Rossumd57fd912000-03-10 22:53:23 +00001089PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001090 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 const char *encoding,
1092 const char *errors)
1093{
1094 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001095
Guido van Rossumd57fd912000-03-10 22:53:23 +00001096 unicode = PyUnicode_FromUnicode(s, size);
1097 if (unicode == NULL)
1098 return NULL;
1099 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1100 Py_DECREF(unicode);
1101 return v;
1102}
1103
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001104PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1105 const char *encoding,
1106 const char *errors)
1107{
1108 PyObject *v;
1109
1110 if (!PyUnicode_Check(unicode)) {
1111 PyErr_BadArgument();
1112 goto onError;
1113 }
1114
1115 if (encoding == NULL)
1116 encoding = PyUnicode_GetDefaultEncoding();
1117
1118 /* Encode via the codec registry */
1119 v = PyCodec_Encode(unicode, encoding, errors);
1120 if (v == NULL)
1121 goto onError;
1122 return v;
1123
1124 onError:
1125 return NULL;
1126}
1127
Guido van Rossumd57fd912000-03-10 22:53:23 +00001128PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1129 const char *encoding,
1130 const char *errors)
1131{
1132 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001133
Guido van Rossumd57fd912000-03-10 22:53:23 +00001134 if (!PyUnicode_Check(unicode)) {
1135 PyErr_BadArgument();
1136 goto onError;
1137 }
Fred Drakee4315f52000-05-09 19:53:39 +00001138
Tim Petersced69f82003-09-16 20:30:58 +00001139 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001140 encoding = PyUnicode_GetDefaultEncoding();
1141
1142 /* Shortcuts for common default encodings */
1143 if (errors == NULL) {
1144 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001145 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001146 else if (strcmp(encoding, "latin-1") == 0)
1147 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001148#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1149 else if (strcmp(encoding, "mbcs") == 0)
1150 return PyUnicode_AsMBCSString(unicode);
1151#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001152 else if (strcmp(encoding, "ascii") == 0)
1153 return PyUnicode_AsASCIIString(unicode);
1154 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001155
1156 /* Encode via the codec registry */
1157 v = PyCodec_Encode(unicode, encoding, errors);
1158 if (v == NULL)
1159 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001160 if (!PyBytes_Check(v)) {
1161 if (PyString_Check(v)) {
1162 /* Old codec, turn it into bytes */
1163 PyObject *b = PyBytes_FromObject(v);
1164 Py_DECREF(v);
1165 return b;
1166 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001168 "encoder did not return a bytes object "
1169 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1170 v->ob_type->tp_name,
1171 encoding ? encoding : "NULL",
1172 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001173 Py_DECREF(v);
1174 goto onError;
1175 }
1176 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001177
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178 onError:
1179 return NULL;
1180}
1181
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001182PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1183 const char *errors)
1184{
1185 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001186 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001187 if (v)
1188 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001189 if (errors != NULL)
1190 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1191 if (errors == NULL) {
1192 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1193 PyUnicode_GET_SIZE(unicode),
1194 NULL);
1195 }
1196 else {
1197 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
1198 }
1199 if (!b)
1200 return NULL;
1201 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1202 PyBytes_Size(b));
1203 Py_DECREF(b);
Guido van Rossume7a0d392007-07-12 07:53:00 +00001204 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001205 return v;
1206}
1207
Martin v. Löwis5b222132007-06-10 09:51:05 +00001208char*
1209PyUnicode_AsString(PyObject *unicode)
1210{
1211 assert(PyUnicode_Check(unicode));
1212 unicode = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1213 if (!unicode)
1214 return NULL;
1215 return PyString_AsString(unicode);
1216}
1217
Guido van Rossumd57fd912000-03-10 22:53:23 +00001218Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1219{
1220 if (!PyUnicode_Check(unicode)) {
1221 PyErr_BadArgument();
1222 goto onError;
1223 }
1224 return PyUnicode_AS_UNICODE(unicode);
1225
1226 onError:
1227 return NULL;
1228}
1229
Martin v. Löwis18e16552006-02-15 17:27:45 +00001230Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001231{
1232 if (!PyUnicode_Check(unicode)) {
1233 PyErr_BadArgument();
1234 goto onError;
1235 }
1236 return PyUnicode_GET_SIZE(unicode);
1237
1238 onError:
1239 return -1;
1240}
1241
Thomas Wouters78890102000-07-22 19:25:51 +00001242const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001243{
1244 return unicode_default_encoding;
1245}
1246
1247int PyUnicode_SetDefaultEncoding(const char *encoding)
1248{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001249 if (strcmp(encoding, unicode_default_encoding) != 0) {
1250 PyErr_Format(PyExc_ValueError,
1251 "Can only set default encoding to %s",
1252 unicode_default_encoding);
1253 return -1;
1254 }
Fred Drakee4315f52000-05-09 19:53:39 +00001255 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001256}
1257
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001258/* error handling callback helper:
1259 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001260 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001261 and adjust various state variables.
1262 return 0 on success, -1 on error
1263*/
1264
1265static
1266int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1267 const char *encoding, const char *reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001268 const char **input, const char **inend, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001269 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001270{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001271 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001272
1273 PyObject *restuple = NULL;
1274 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001275 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001276 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001277 Py_ssize_t requiredsize;
1278 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001279 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001280 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001281 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001282 int res = -1;
1283
1284 if (*errorHandler == NULL) {
1285 *errorHandler = PyCodec_LookupError(errors);
1286 if (*errorHandler == NULL)
1287 goto onError;
1288 }
1289
1290 if (*exceptionObject == NULL) {
1291 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001292 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001293 if (*exceptionObject == NULL)
1294 goto onError;
1295 }
1296 else {
1297 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1298 goto onError;
1299 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1300 goto onError;
1301 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1302 goto onError;
1303 }
1304
1305 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1306 if (restuple == NULL)
1307 goto onError;
1308 if (!PyTuple_Check(restuple)) {
1309 PyErr_Format(PyExc_TypeError, &argparse[4]);
1310 goto onError;
1311 }
1312 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1313 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001314
1315 /* Copy back the bytes variables, which might have been modified by the
1316 callback */
1317 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1318 if (!inputobj)
1319 goto onError;
1320 if (!PyBytes_Check(inputobj)) {
1321 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1322 }
1323 *input = PyBytes_AS_STRING(inputobj);
1324 insize = PyBytes_GET_SIZE(inputobj);
1325 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001326 /* we can DECREF safely, as the exception has another reference,
1327 so the object won't go away. */
1328 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001329
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001330 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001331 newpos = insize+newpos;
1332 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001333 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001334 goto onError;
1335 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001336
1337 /* need more space? (at least enough for what we
1338 have+the replacement+the rest of the string (starting
1339 at the new input position), so we won't have to check space
1340 when there are no errors in the rest of the string) */
1341 repptr = PyUnicode_AS_UNICODE(repunicode);
1342 repsize = PyUnicode_GET_SIZE(repunicode);
1343 requiredsize = *outpos + repsize + insize-newpos;
1344 if (requiredsize > outsize) {
1345 if (requiredsize<2*outsize)
1346 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001347 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001348 goto onError;
1349 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1350 }
1351 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001352 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001353 Py_UNICODE_COPY(*outptr, repptr, repsize);
1354 *outptr += repsize;
1355 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001356
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001357 /* we made it! */
1358 res = 0;
1359
1360 onError:
1361 Py_XDECREF(restuple);
1362 return res;
1363}
1364
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001365/* --- UTF-7 Codec -------------------------------------------------------- */
1366
1367/* see RFC2152 for details */
1368
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if character c must not be written directly.  encodeO / encodeWS
   select whether the optional classes 3 (Set O) and 2 (whitespace) of
   utf7_special count as special as well.  Evaluates c more than once. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Map the low six bits of n to its base64 alphabet character. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a character of the base64 alphabet.  isalnum() is only
   defined for values representable as unsigned char (or EOF), so anything
   outside the ASCII range is rejected before it is consulted. */
#define B64CHAR(c) \
    (((unsigned long)(c)) < 128 && \
     (isalnum((int)(c)) || (c) == '+' || (c) == '/'))

/* Inverse of B64(): value (0..63) of base64 character c.  The result is
   only meaningful when B64CHAR(c) holds. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 characters to *out for as long as at least six pending bits
   are available in ch.  out and bits are modified in place. */
#define ENCODE(out, ch, bits)                   \
    while ((bits) >= 6) {                       \
        *(out)++ = B64((ch) >> ((bits)-6));     \
        (bits) -= 6;                            \
    }

/* Emit decoded 16-bit units to *out for as long as at least sixteen
   pending bits are available in ch.  out, bits and surrogate are modified
   in place.  The expansion also assigns to a variable named errmsg and
   jumps to a label named utf7Error, both of which must exist at the
   point of use. */
#define DECODE(out, ch, bits, surrogate)                                \
    while ((bits) >= 16) {                                              \
        Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
        (bits) -= 16;                                                   \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            (surrogate) = 0;                                            \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            (surrogate) = 1;                                            \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *(out)++ = outCh;                                           \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001430
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001431PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001432 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001433 const char *errors)
1434{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001435 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001436 Py_ssize_t startinpos;
1437 Py_ssize_t endinpos;
1438 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001439 const char *e;
1440 PyUnicodeObject *unicode;
1441 Py_UNICODE *p;
1442 const char *errmsg = "";
1443 int inShift = 0;
1444 unsigned int bitsleft = 0;
1445 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001446 int surrogate = 0;
1447 PyObject *errorHandler = NULL;
1448 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001449
1450 unicode = _PyUnicode_New(size);
1451 if (!unicode)
1452 return NULL;
1453 if (size == 0)
1454 return (PyObject *)unicode;
1455
1456 p = unicode->str;
1457 e = s + size;
1458
1459 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001460 Py_UNICODE ch;
1461 restart:
1462 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001463
1464 if (inShift) {
1465 if ((ch == '-') || !B64CHAR(ch)) {
1466 inShift = 0;
1467 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001468
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001469 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1470 if (bitsleft >= 6) {
1471 /* The shift sequence has a partial character in it. If
1472 bitsleft < 6 then we could just classify it as padding
1473 but that is not the case here */
1474
1475 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001476 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001477 }
1478 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001479 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001480 here so indicate the potential of a misencoded character. */
1481
1482 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1483 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1484 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001485 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001486 }
1487
1488 if (ch == '-') {
1489 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001490 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001491 inShift = 1;
1492 }
1493 } else if (SPECIAL(ch,0,0)) {
1494 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001495 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001496 } else {
1497 *p++ = ch;
1498 }
1499 } else {
1500 charsleft = (charsleft << 6) | UB64(ch);
1501 bitsleft += 6;
1502 s++;
1503 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1504 }
1505 }
1506 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001507 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001508 s++;
1509 if (s < e && *s == '-') {
1510 s++;
1511 *p++ = '+';
1512 } else
1513 {
1514 inShift = 1;
1515 bitsleft = 0;
1516 }
1517 }
1518 else if (SPECIAL(ch,0,0)) {
1519 errmsg = "unexpected special character";
1520 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001521 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001522 }
1523 else {
1524 *p++ = ch;
1525 s++;
1526 }
1527 continue;
1528 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001529 outpos = p-PyUnicode_AS_UNICODE(unicode);
1530 endinpos = s-starts;
1531 if (unicode_decode_call_errorhandler(
1532 errors, &errorHandler,
1533 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001534 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001535 (PyObject **)&unicode, &outpos, &p))
1536 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001537 }
1538
1539 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001540 outpos = p-PyUnicode_AS_UNICODE(unicode);
1541 endinpos = size;
1542 if (unicode_decode_call_errorhandler(
1543 errors, &errorHandler,
1544 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001545 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001546 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001547 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001548 if (s < e)
1549 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001550 }
1551
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001552 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001553 goto onError;
1554
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001555 Py_XDECREF(errorHandler);
1556 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001557 return (PyObject *)unicode;
1558
1559onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001560 Py_XDECREF(errorHandler);
1561 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001562 Py_DECREF(unicode);
1563 return NULL;
1564}
1565
1566
1567PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001568 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001569 int encodeSetO,
1570 int encodeWhiteSpace,
1571 const char *errors)
1572{
1573 PyObject *v;
1574 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001575 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001576 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001577 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001578 unsigned int bitsleft = 0;
1579 unsigned long charsleft = 0;
1580 char * out;
1581 char * start;
1582
1583 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001584 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001585
Walter Dörwald51ab4142007-05-05 14:43:36 +00001586 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001587 if (v == NULL)
1588 return NULL;
1589
Walter Dörwald51ab4142007-05-05 14:43:36 +00001590 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001591 for (;i < size; ++i) {
1592 Py_UNICODE ch = s[i];
1593
1594 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001595 if (ch == '+') {
1596 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001597 *out++ = '-';
1598 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1599 charsleft = ch;
1600 bitsleft = 16;
1601 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001602 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001603 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001604 } else {
1605 *out++ = (char) ch;
1606 }
1607 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001608 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1609 *out++ = B64(charsleft << (6-bitsleft));
1610 charsleft = 0;
1611 bitsleft = 0;
1612 /* Characters not in the BASE64 set implicitly unshift the sequence
1613 so no '-' is required, except if the character is itself a '-' */
1614 if (B64CHAR(ch) || ch == '-') {
1615 *out++ = '-';
1616 }
1617 inShift = 0;
1618 *out++ = (char) ch;
1619 } else {
1620 bitsleft += 16;
1621 charsleft = (charsleft << 16) | ch;
1622 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1623
1624 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001625 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001626 or '-' then the shift sequence will be terminated implicitly and we
1627 don't have to insert a '-'. */
1628
1629 if (bitsleft == 0) {
1630 if (i + 1 < size) {
1631 Py_UNICODE ch2 = s[i+1];
1632
1633 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001634
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001635 } else if (B64CHAR(ch2) || ch2 == '-') {
1636 *out++ = '-';
1637 inShift = 0;
1638 } else {
1639 inShift = 0;
1640 }
1641
1642 }
1643 else {
1644 *out++ = '-';
1645 inShift = 0;
1646 }
1647 }
Tim Petersced69f82003-09-16 20:30:58 +00001648 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001649 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001650 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001651 if (bitsleft) {
1652 *out++= B64(charsleft << (6-bitsleft) );
1653 *out++ = '-';
1654 }
1655
Walter Dörwald51ab4142007-05-05 14:43:36 +00001656 if (PyBytes_Resize(v, out - start)) {
1657 Py_DECREF(v);
1658 return NULL;
1659 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001660 return v;
1661}
1662
/* The helper macros used by the UTF-7 encoder and decoder above are not
   needed past this point. */
#undef DECODE
#undef ENCODE
#undef UB64
#undef B64CHAR
#undef B64
#undef SPECIAL
1669
Guido van Rossumd57fd912000-03-10 22:53:23 +00001670/* --- UTF-8 Codec -------------------------------------------------------- */
1671
/* Sequence length implied by each possible UTF-8 lead byte, indexed by the
   byte value.  A zero entry marks a byte that cannot start a sequence.
   See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: one-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    /* 0x80 - 0xBF: illegal as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    /* 0xF0 - 0xF7: four bytes; 0xF8 - 0xFB: five; 0xFC - 0xFD: six;
       0xFE - 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1693
Guido van Rossumd57fd912000-03-10 22:53:23 +00001694PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001695 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001696 const char *errors)
1697{
Walter Dörwald69652032004-09-07 20:24:22 +00001698 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1699}
1700
1701PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001702 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001703 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001704 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001705{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001706 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001707 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001708 Py_ssize_t startinpos;
1709 Py_ssize_t endinpos;
1710 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001711 const char *e;
1712 PyUnicodeObject *unicode;
1713 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001714 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001715 PyObject *errorHandler = NULL;
1716 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001717
1718 /* Note: size will always be longer than the resulting Unicode
1719 character count */
1720 unicode = _PyUnicode_New(size);
1721 if (!unicode)
1722 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001723 if (size == 0) {
1724 if (consumed)
1725 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001726 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001727 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001728
1729 /* Unpack UTF-8 encoded data */
1730 p = unicode->str;
1731 e = s + size;
1732
1733 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001734 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001735
1736 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001737 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001738 s++;
1739 continue;
1740 }
1741
1742 n = utf8_code_length[ch];
1743
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001744 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001745 if (consumed)
1746 break;
1747 else {
1748 errmsg = "unexpected end of data";
1749 startinpos = s-starts;
1750 endinpos = size;
1751 goto utf8Error;
1752 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001753 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001754
1755 switch (n) {
1756
1757 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001758 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001759 startinpos = s-starts;
1760 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001761 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001762
1763 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001764 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001765 startinpos = s-starts;
1766 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001767 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001768
1769 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001770 if ((s[1] & 0xc0) != 0x80) {
1771 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001772 startinpos = s-starts;
1773 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001774 goto utf8Error;
1775 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001776 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001777 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001778 startinpos = s-starts;
1779 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001780 errmsg = "illegal encoding";
1781 goto utf8Error;
1782 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001783 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001784 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001785 break;
1786
1787 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001788 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001789 (s[2] & 0xc0) != 0x80) {
1790 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001791 startinpos = s-starts;
1792 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001793 goto utf8Error;
1794 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001795 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001796 if (ch < 0x0800) {
1797 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001798 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001799
1800 XXX For wide builds (UCS-4) we should probably try
1801 to recombine the surrogates into a single code
1802 unit.
1803 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001804 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001805 startinpos = s-starts;
1806 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001807 goto utf8Error;
1808 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001809 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001810 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001811 break;
1812
1813 case 4:
1814 if ((s[1] & 0xc0) != 0x80 ||
1815 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001816 (s[3] & 0xc0) != 0x80) {
1817 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001818 startinpos = s-starts;
1819 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001820 goto utf8Error;
1821 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001822 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1823 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1824 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001825 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001826 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001827 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001828 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001829 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001830 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001831 startinpos = s-starts;
1832 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001833 goto utf8Error;
1834 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001835#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001836 *p++ = (Py_UNICODE)ch;
1837#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001838 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001839
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001840 /* translate from 10000..10FFFF to 0..FFFF */
1841 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001842
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001843 /* high surrogate = top 10 bits added to D800 */
1844 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001845
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001846 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001847 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001848#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849 break;
1850
1851 default:
1852 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001853 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001854 startinpos = s-starts;
1855 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001856 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001857 }
1858 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001859 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001860
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001861 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001862 outpos = p-PyUnicode_AS_UNICODE(unicode);
1863 if (unicode_decode_call_errorhandler(
1864 errors, &errorHandler,
1865 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001866 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001867 (PyObject **)&unicode, &outpos, &p))
1868 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001869 }
Walter Dörwald69652032004-09-07 20:24:22 +00001870 if (consumed)
1871 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001872
1873 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001874 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001875 goto onError;
1876
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001877 Py_XDECREF(errorHandler);
1878 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001879 return (PyObject *)unicode;
1880
1881onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001882 Py_XDECREF(errorHandler);
1883 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001884 Py_DECREF(unicode);
1885 return NULL;
1886}
1887
Tim Peters602f7402002-04-27 18:03:26 +00001888/* Allocation strategy: if the string is short, convert into a stack buffer
1889 and allocate exactly as much space needed at the end. Else allocate the
1890 maximum possible needed (4 result bytes per Unicode character), and return
1891 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001892*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001893PyObject *
1894PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001895 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001896 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001897{
Tim Peters602f7402002-04-27 18:03:26 +00001898#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001899
Martin v. Löwis18e16552006-02-15 17:27:45 +00001900 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001901 PyObject *v; /* result string object */
1902 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001903 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001904 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001905 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001906
Tim Peters602f7402002-04-27 18:03:26 +00001907 assert(s != NULL);
1908 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001909
Tim Peters602f7402002-04-27 18:03:26 +00001910 if (size <= MAX_SHORT_UNICHARS) {
1911 /* Write into the stack buffer; nallocated can't overflow.
1912 * At the end, we'll allocate exactly as much heap space as it
1913 * turns out we need.
1914 */
1915 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1916 v = NULL; /* will allocate after we're done */
1917 p = stackbuf;
1918 }
1919 else {
1920 /* Overallocate on the heap, and give the excess back at the end. */
1921 nallocated = size * 4;
1922 if (nallocated / 4 != size) /* overflow! */
1923 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001924 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001925 if (v == NULL)
1926 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001927 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001928 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001929
Tim Peters602f7402002-04-27 18:03:26 +00001930 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001931 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001932
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001933 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001934 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001935 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001936
Guido van Rossumd57fd912000-03-10 22:53:23 +00001937 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001938 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001939 *p++ = (char)(0xc0 | (ch >> 6));
1940 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001941 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001942 else {
Tim Peters602f7402002-04-27 18:03:26 +00001943 /* Encode UCS2 Unicode ordinals */
1944 if (ch < 0x10000) {
1945 /* Special case: check for high surrogate */
1946 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1947 Py_UCS4 ch2 = s[i];
1948 /* Check for low surrogate and combine the two to
1949 form a UCS4 value */
1950 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001951 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001952 i++;
1953 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001954 }
Tim Peters602f7402002-04-27 18:03:26 +00001955 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001956 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001957 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001958 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1959 *p++ = (char)(0x80 | (ch & 0x3f));
1960 continue;
1961 }
1962encodeUCS4:
1963 /* Encode UCS4 Unicode ordinals */
1964 *p++ = (char)(0xf0 | (ch >> 18));
1965 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1966 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1967 *p++ = (char)(0x80 | (ch & 0x3f));
1968 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001969 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001970
Tim Peters602f7402002-04-27 18:03:26 +00001971 if (v == NULL) {
1972 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001973 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001974 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001975 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001976 }
1977 else {
1978 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001979 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001980 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001981 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001982 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001983 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001984
Tim Peters602f7402002-04-27 18:03:26 +00001985#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001986}
1987
Guido van Rossumd57fd912000-03-10 22:53:23 +00001988PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1989{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001990 if (!PyUnicode_Check(unicode)) {
1991 PyErr_BadArgument();
1992 return NULL;
1993 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001994 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1995 PyUnicode_GET_SIZE(unicode),
1996 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997}
1998
1999/* --- UTF-16 Codec ------------------------------------------------------- */
2000
Tim Peters772747b2001-08-09 22:21:55 +00002001PyObject *
2002PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002003 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002004 const char *errors,
2005 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002006{
Walter Dörwald69652032004-09-07 20:24:22 +00002007 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2008}
2009
2010PyObject *
2011PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002012 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002013 const char *errors,
2014 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002015 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002016{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002017 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002018 Py_ssize_t startinpos;
2019 Py_ssize_t endinpos;
2020 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021 PyUnicodeObject *unicode;
2022 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002023 const unsigned char *q, *e;
2024 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002025 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002026 /* Offsets from q for retrieving byte pairs in the right order. */
2027#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2028 int ihi = 1, ilo = 0;
2029#else
2030 int ihi = 0, ilo = 1;
2031#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002032 PyObject *errorHandler = NULL;
2033 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002034
2035 /* Note: size will always be longer than the resulting Unicode
2036 character count */
2037 unicode = _PyUnicode_New(size);
2038 if (!unicode)
2039 return NULL;
2040 if (size == 0)
2041 return (PyObject *)unicode;
2042
2043 /* Unpack UTF-16 encoded data */
2044 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002045 q = (unsigned char *)s;
2046 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047
2048 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002049 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002051 /* Check for BOM marks (U+FEFF) in the input and adjust current
2052 byte order setting accordingly. In native mode, the leading BOM
2053 mark is skipped, in all other modes, it is copied to the output
2054 stream as-is (giving a ZWNBSP character). */
2055 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002056 if (size >= 2) {
2057 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002058#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002059 if (bom == 0xFEFF) {
2060 q += 2;
2061 bo = -1;
2062 }
2063 else if (bom == 0xFFFE) {
2064 q += 2;
2065 bo = 1;
2066 }
Tim Petersced69f82003-09-16 20:30:58 +00002067#else
Walter Dörwald69652032004-09-07 20:24:22 +00002068 if (bom == 0xFEFF) {
2069 q += 2;
2070 bo = 1;
2071 }
2072 else if (bom == 0xFFFE) {
2073 q += 2;
2074 bo = -1;
2075 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002076#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002077 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002078 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002079
Tim Peters772747b2001-08-09 22:21:55 +00002080 if (bo == -1) {
2081 /* force LE */
2082 ihi = 1;
2083 ilo = 0;
2084 }
2085 else if (bo == 1) {
2086 /* force BE */
2087 ihi = 0;
2088 ilo = 1;
2089 }
2090
2091 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002092 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002093 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002094 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002095 if (consumed)
2096 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002097 errmsg = "truncated data";
2098 startinpos = ((const char *)q)-starts;
2099 endinpos = ((const char *)e)-starts;
2100 goto utf16Error;
2101 /* The remaining input chars are ignored if the callback
2102 chooses to skip the input */
2103 }
2104 ch = (q[ihi] << 8) | q[ilo];
2105
Tim Peters772747b2001-08-09 22:21:55 +00002106 q += 2;
2107
Guido van Rossumd57fd912000-03-10 22:53:23 +00002108 if (ch < 0xD800 || ch > 0xDFFF) {
2109 *p++ = ch;
2110 continue;
2111 }
2112
2113 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002114 if (q >= e) {
2115 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002116 startinpos = (((const char *)q)-2)-starts;
2117 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002118 goto utf16Error;
2119 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002120 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002121 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2122 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002123 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002124#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002125 *p++ = ch;
2126 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002127#else
2128 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002129#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002130 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002131 }
2132 else {
2133 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002134 startinpos = (((const char *)q)-4)-starts;
2135 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002136 goto utf16Error;
2137 }
2138
Guido van Rossumd57fd912000-03-10 22:53:23 +00002139 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002140 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002141 startinpos = (((const char *)q)-2)-starts;
2142 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002143 /* Fall through to report the error */
2144
2145 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002146 outpos = p-PyUnicode_AS_UNICODE(unicode);
2147 if (unicode_decode_call_errorhandler(
2148 errors, &errorHandler,
2149 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002150 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002151 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002152 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002153 }
2154
2155 if (byteorder)
2156 *byteorder = bo;
2157
Walter Dörwald69652032004-09-07 20:24:22 +00002158 if (consumed)
2159 *consumed = (const char *)q-starts;
2160
Guido van Rossumd57fd912000-03-10 22:53:23 +00002161 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002162 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002163 goto onError;
2164
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002165 Py_XDECREF(errorHandler);
2166 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167 return (PyObject *)unicode;
2168
2169onError:
2170 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002171 Py_XDECREF(errorHandler);
2172 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002173 return NULL;
2174}
2175
Tim Peters772747b2001-08-09 22:21:55 +00002176PyObject *
2177PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002178 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002179 const char *errors,
2180 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002181{
2182 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002183 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002184#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002185 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002186#else
2187 const int pairs = 0;
2188#endif
Tim Peters772747b2001-08-09 22:21:55 +00002189 /* Offsets from p for storing byte pairs in the right order. */
2190#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2191 int ihi = 1, ilo = 0;
2192#else
2193 int ihi = 0, ilo = 1;
2194#endif
2195
2196#define STORECHAR(CH) \
2197 do { \
2198 p[ihi] = ((CH) >> 8) & 0xff; \
2199 p[ilo] = (CH) & 0xff; \
2200 p += 2; \
2201 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002202
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002203#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002204 for (i = pairs = 0; i < size; i++)
2205 if (s[i] >= 0x10000)
2206 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002207#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002208 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002209 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002210 if (v == NULL)
2211 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002212
Walter Dörwald3cc34522007-05-04 10:48:27 +00002213 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002214 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002215 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002216 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002217 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002218
2219 if (byteorder == -1) {
2220 /* force LE */
2221 ihi = 1;
2222 ilo = 0;
2223 }
2224 else if (byteorder == 1) {
2225 /* force BE */
2226 ihi = 0;
2227 ilo = 1;
2228 }
2229
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002230 while (size-- > 0) {
2231 Py_UNICODE ch = *s++;
2232 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002233#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002234 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002235 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2236 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002237 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002238#endif
Tim Peters772747b2001-08-09 22:21:55 +00002239 STORECHAR(ch);
2240 if (ch2)
2241 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002242 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002243 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002244#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002245}
2246
2247PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2248{
2249 if (!PyUnicode_Check(unicode)) {
2250 PyErr_BadArgument();
2251 return NULL;
2252 }
2253 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2254 PyUnicode_GET_SIZE(unicode),
2255 NULL,
2256 0);
2257}
2258
2259/* --- Unicode Escape Codec ----------------------------------------------- */
2260
Fredrik Lundh06d12682001-01-24 07:59:11 +00002261static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002262
Guido van Rossumd57fd912000-03-10 22:53:23 +00002263PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002264 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002265 const char *errors)
2266{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002267 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002268 Py_ssize_t startinpos;
2269 Py_ssize_t endinpos;
2270 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002271 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002272 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002273 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002274 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002275 char* message;
2276 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002277 PyObject *errorHandler = NULL;
2278 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002279
Guido van Rossumd57fd912000-03-10 22:53:23 +00002280 /* Escaped strings will always be longer than the resulting
2281 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002282 length after conversion to the true value.
2283 (but if the error callback returns a long replacement string
2284 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002285 v = _PyUnicode_New(size);
2286 if (v == NULL)
2287 goto onError;
2288 if (size == 0)
2289 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002290
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002291 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002292 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002293
Guido van Rossumd57fd912000-03-10 22:53:23 +00002294 while (s < end) {
2295 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002296 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002297 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298
2299 /* Non-escape characters are interpreted as Unicode ordinals */
2300 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002301 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002302 continue;
2303 }
2304
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002305 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002306 /* \ - Escapes */
2307 s++;
2308 switch (*s++) {
2309
2310 /* \x escapes */
2311 case '\n': break;
2312 case '\\': *p++ = '\\'; break;
2313 case '\'': *p++ = '\''; break;
2314 case '\"': *p++ = '\"'; break;
2315 case 'b': *p++ = '\b'; break;
2316 case 'f': *p++ = '\014'; break; /* FF */
2317 case 't': *p++ = '\t'; break;
2318 case 'n': *p++ = '\n'; break;
2319 case 'r': *p++ = '\r'; break;
2320 case 'v': *p++ = '\013'; break; /* VT */
2321 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2322
2323 /* \OOO (octal) escapes */
2324 case '0': case '1': case '2': case '3':
2325 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002326 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002327 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002328 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002329 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002330 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002331 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002332 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002333 break;
2334
Fredrik Lundhccc74732001-02-18 22:13:49 +00002335 /* hex escapes */
2336 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002337 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002338 digits = 2;
2339 message = "truncated \\xXX escape";
2340 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002341
Fredrik Lundhccc74732001-02-18 22:13:49 +00002342 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002343 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002344 digits = 4;
2345 message = "truncated \\uXXXX escape";
2346 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002347
Fredrik Lundhccc74732001-02-18 22:13:49 +00002348 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002349 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002350 digits = 8;
2351 message = "truncated \\UXXXXXXXX escape";
2352 hexescape:
2353 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002354 outpos = p-PyUnicode_AS_UNICODE(v);
2355 if (s+digits>end) {
2356 endinpos = size;
2357 if (unicode_decode_call_errorhandler(
2358 errors, &errorHandler,
2359 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002360 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002361 (PyObject **)&v, &outpos, &p))
2362 goto onError;
2363 goto nextByte;
2364 }
2365 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002366 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002367 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002368 endinpos = (s+i+1)-starts;
2369 if (unicode_decode_call_errorhandler(
2370 errors, &errorHandler,
2371 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002372 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002373 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002374 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002375 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002376 }
2377 chr = (chr<<4) & ~0xF;
2378 if (c >= '0' && c <= '9')
2379 chr += c - '0';
2380 else if (c >= 'a' && c <= 'f')
2381 chr += 10 + c - 'a';
2382 else
2383 chr += 10 + c - 'A';
2384 }
2385 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002386 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002387 /* _decoding_error will have already written into the
2388 target buffer. */
2389 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002390 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002391 /* when we get here, chr is a 32-bit unicode character */
2392 if (chr <= 0xffff)
2393 /* UCS-2 character */
2394 *p++ = (Py_UNICODE) chr;
2395 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002396 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002397 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002398#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002399 *p++ = chr;
2400#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002401 chr -= 0x10000L;
2402 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002403 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002404#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002405 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002406 endinpos = s-starts;
2407 outpos = p-PyUnicode_AS_UNICODE(v);
2408 if (unicode_decode_call_errorhandler(
2409 errors, &errorHandler,
2410 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002411 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002412 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002413 goto onError;
2414 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002415 break;
2416
2417 /* \N{name} */
2418 case 'N':
2419 message = "malformed \\N character escape";
2420 if (ucnhash_CAPI == NULL) {
2421 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002422 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002423 m = PyImport_ImportModule("unicodedata");
2424 if (m == NULL)
2425 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002426 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002427 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002428 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002429 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002430 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002431 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002432 if (ucnhash_CAPI == NULL)
2433 goto ucnhashError;
2434 }
2435 if (*s == '{') {
2436 const char *start = s+1;
2437 /* look for the closing brace */
2438 while (*s != '}' && s < end)
2439 s++;
2440 if (s > start && s < end && *s == '}') {
2441 /* found a name. look it up in the unicode database */
2442 message = "unknown Unicode character name";
2443 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002444 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002445 goto store;
2446 }
2447 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002448 endinpos = s-starts;
2449 outpos = p-PyUnicode_AS_UNICODE(v);
2450 if (unicode_decode_call_errorhandler(
2451 errors, &errorHandler,
2452 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002453 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002454 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002455 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002456 break;
2457
2458 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002459 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002460 message = "\\ at end of string";
2461 s--;
2462 endinpos = s-starts;
2463 outpos = p-PyUnicode_AS_UNICODE(v);
2464 if (unicode_decode_call_errorhandler(
2465 errors, &errorHandler,
2466 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002467 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002468 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002469 goto onError;
2470 }
2471 else {
2472 *p++ = '\\';
2473 *p++ = (unsigned char)s[-1];
2474 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002475 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002476 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002477 nextByte:
2478 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002479 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002480 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002481 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002482 Py_XDECREF(errorHandler);
2483 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002484 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002485
Fredrik Lundhccc74732001-02-18 22:13:49 +00002486ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002487 PyErr_SetString(
2488 PyExc_UnicodeError,
2489 "\\N escapes not supported (can't load unicodedata module)"
2490 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002491 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002492 Py_XDECREF(errorHandler);
2493 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002494 return NULL;
2495
Fredrik Lundhccc74732001-02-18 22:13:49 +00002496onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002497 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002498 Py_XDECREF(errorHandler);
2499 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500 return NULL;
2501}
2502
/* Return a Unicode-Escape string version of the Unicode object.

   NOTE: this comment used to describe a `quotes` flag that enclosed the
   result in u"" or u'' quotes as appropriate.  PyUnicode_EncodeUnicodeEscape()
   below takes only (s, size) and adds no quotes.

*/
2509
Thomas Wouters477c8d52006-05-27 19:21:47 +00002510Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2511 Py_ssize_t size,
2512 Py_UNICODE ch)
2513{
2514 /* like wcschr, but doesn't stop at NULL characters */
2515
2516 while (size-- > 0) {
2517 if (*s == ch)
2518 return s;
2519 s++;
2520 }
2521
2522 return NULL;
2523}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002524
/* Lower-case hexadecimal digits, indexed by nibble value (0-15).  Both the
   characters and the pointer are const so the table cannot be retargeted. */
static const char * const hexdigits = "0123456789abcdef";
2526
2527PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2528 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002529{
2530 PyObject *repr;
2531 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002532
Thomas Wouters89f507f2006-12-13 04:49:30 +00002533 /* XXX(nnorwitz): rather than over-allocating, it would be
2534 better to choose a different scheme. Perhaps scan the
2535 first N-chars of the string and allocate based on that size.
2536 */
2537 /* Initial allocation is based on the longest-possible unichr
2538 escape.
2539
2540 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2541 unichr, so in this case it's the longest unichr escape. In
2542 narrow (UTF-16) builds this is five chars per source unichr
2543 since there are two unichrs in the surrogate pair, so in narrow
2544 (UTF-16) builds it's not the longest unichr escape.
2545
2546 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2547 so in the narrow (UTF-16) build case it's the longest unichr
2548 escape.
2549 */
2550
Walter Dörwald79e913e2007-05-12 11:08:06 +00002551 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002552#ifdef Py_UNICODE_WIDE
2553 + 10*size
2554#else
2555 + 6*size
2556#endif
2557 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002558 if (repr == NULL)
2559 return NULL;
2560
Walter Dörwald79e913e2007-05-12 11:08:06 +00002561 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002562
Guido van Rossumd57fd912000-03-10 22:53:23 +00002563 while (size-- > 0) {
2564 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002565
Walter Dörwald79e913e2007-05-12 11:08:06 +00002566 /* Escape backslashes */
2567 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002568 *p++ = '\\';
2569 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002570 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002571 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002572
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002573#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002574 /* Map 21-bit characters to '\U00xxxxxx' */
2575 else if (ch >= 0x10000) {
2576 *p++ = '\\';
2577 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002578 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2579 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2580 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2581 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2582 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2583 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2584 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2585 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002586 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002587 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002588#else
2589 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002590 else if (ch >= 0xD800 && ch < 0xDC00) {
2591 Py_UNICODE ch2;
2592 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002593
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002594 ch2 = *s++;
2595 size--;
2596 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2597 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2598 *p++ = '\\';
2599 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002600 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2601 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2602 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2603 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2604 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2605 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2606 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2607 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002608 continue;
2609 }
2610 /* Fall through: isolated surrogates are copied as-is */
2611 s--;
2612 size++;
2613 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002614#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002615
Guido van Rossumd57fd912000-03-10 22:53:23 +00002616 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002617 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002618 *p++ = '\\';
2619 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002620 *p++ = hexdigits[(ch >> 12) & 0x000F];
2621 *p++ = hexdigits[(ch >> 8) & 0x000F];
2622 *p++ = hexdigits[(ch >> 4) & 0x000F];
2623 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002624 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002625
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002626 /* Map special whitespace to '\t', \n', '\r' */
2627 else if (ch == '\t') {
2628 *p++ = '\\';
2629 *p++ = 't';
2630 }
2631 else if (ch == '\n') {
2632 *p++ = '\\';
2633 *p++ = 'n';
2634 }
2635 else if (ch == '\r') {
2636 *p++ = '\\';
2637 *p++ = 'r';
2638 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002639
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002640 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002641 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002642 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002643 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002644 *p++ = hexdigits[(ch >> 4) & 0x000F];
2645 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002646 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002647
Guido van Rossumd57fd912000-03-10 22:53:23 +00002648 /* Copy everything else as-is */
2649 else
2650 *p++ = (char) ch;
2651 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002652
2653 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002654 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2655 Py_DECREF(repr);
2656 return NULL;
2657 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002658 return repr;
2659}
2660
Guido van Rossumd57fd912000-03-10 22:53:23 +00002661PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2662{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002663 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002664 if (!PyUnicode_Check(unicode)) {
2665 PyErr_BadArgument();
2666 return NULL;
2667 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002668 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2669 PyUnicode_GET_SIZE(unicode));
2670
2671 if (!s)
2672 return NULL;
2673 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2674 PyBytes_GET_SIZE(s));
2675 Py_DECREF(s);
2676 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002677}
2678
2679/* --- Raw Unicode Escape Codec ------------------------------------------- */
2680
2681PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002682 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002683 const char *errors)
2684{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002685 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002686 Py_ssize_t startinpos;
2687 Py_ssize_t endinpos;
2688 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002689 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002690 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002691 const char *end;
2692 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002693 PyObject *errorHandler = NULL;
2694 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002695
Guido van Rossumd57fd912000-03-10 22:53:23 +00002696 /* Escaped strings will always be longer than the resulting
2697 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002698 length after conversion to the true value. (But decoding error
2699 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002700 v = _PyUnicode_New(size);
2701 if (v == NULL)
2702 goto onError;
2703 if (size == 0)
2704 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002705 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706 end = s + size;
2707 while (s < end) {
2708 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002709 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002710 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002711 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002712
2713 /* Non-escape characters are interpreted as Unicode ordinals */
2714 if (*s != '\\') {
2715 *p++ = (unsigned char)*s++;
2716 continue;
2717 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002718 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002719
2720 /* \u-escapes are only interpreted iff the number of leading
2721 backslashes if odd */
2722 bs = s;
2723 for (;s < end;) {
2724 if (*s != '\\')
2725 break;
2726 *p++ = (unsigned char)*s++;
2727 }
2728 if (((s - bs) & 1) == 0 ||
2729 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002730 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002731 continue;
2732 }
2733 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002734 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735 s++;
2736
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002737 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002738 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002739 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002740 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002742 endinpos = s-starts;
2743 if (unicode_decode_call_errorhandler(
2744 errors, &errorHandler,
2745 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002746 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002747 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002748 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002749 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002750 }
2751 x = (x<<4) & ~0xF;
2752 if (c >= '0' && c <= '9')
2753 x += c - '0';
2754 else if (c >= 'a' && c <= 'f')
2755 x += 10 + c - 'a';
2756 else
2757 x += 10 + c - 'A';
2758 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002759#ifndef Py_UNICODE_WIDE
2760 if (x > 0x10000) {
2761 if (unicode_decode_call_errorhandler(
2762 errors, &errorHandler,
2763 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002764 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002765 (PyObject **)&v, &outpos, &p))
2766 goto onError;
2767 }
2768#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002769 *p++ = x;
2770 nextByte:
2771 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002772 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002773 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002774 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002775 Py_XDECREF(errorHandler);
2776 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002777 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002778
Guido van Rossumd57fd912000-03-10 22:53:23 +00002779 onError:
2780 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002781 Py_XDECREF(errorHandler);
2782 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002783 return NULL;
2784}
2785
2786PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002787 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002788{
2789 PyObject *repr;
2790 char *p;
2791 char *q;
2792
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002793#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00002794 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002795#else
Walter Dörwald711005d2007-05-12 12:03:26 +00002796 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002797#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798 if (repr == NULL)
2799 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002800 if (size == 0)
2801 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802
Walter Dörwald711005d2007-05-12 12:03:26 +00002803 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002804 while (size-- > 0) {
2805 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002806#ifdef Py_UNICODE_WIDE
2807 /* Map 32-bit characters to '\Uxxxxxxxx' */
2808 if (ch >= 0x10000) {
2809 *p++ = '\\';
2810 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002811 *p++ = hexdigits[(ch >> 28) & 0xf];
2812 *p++ = hexdigits[(ch >> 24) & 0xf];
2813 *p++ = hexdigits[(ch >> 20) & 0xf];
2814 *p++ = hexdigits[(ch >> 16) & 0xf];
2815 *p++ = hexdigits[(ch >> 12) & 0xf];
2816 *p++ = hexdigits[(ch >> 8) & 0xf];
2817 *p++ = hexdigits[(ch >> 4) & 0xf];
2818 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002819 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002820 else
2821#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002822 /* Map 16-bit characters to '\uxxxx' */
2823 if (ch >= 256) {
2824 *p++ = '\\';
2825 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002826 *p++ = hexdigits[(ch >> 12) & 0xf];
2827 *p++ = hexdigits[(ch >> 8) & 0xf];
2828 *p++ = hexdigits[(ch >> 4) & 0xf];
2829 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002830 }
2831 /* Copy everything else as-is */
2832 else
2833 *p++ = (char) ch;
2834 }
2835 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00002836 if (PyBytes_Resize(repr, p - q)) {
2837 Py_DECREF(repr);
2838 return NULL;
2839 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002840 return repr;
2841}
2842
2843PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2844{
Walter Dörwald711005d2007-05-12 12:03:26 +00002845 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002846 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00002847 PyErr_BadArgument();
2848 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002849 }
Walter Dörwald711005d2007-05-12 12:03:26 +00002850 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2851 PyUnicode_GET_SIZE(unicode));
2852
2853 if (!s)
2854 return NULL;
2855 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2856 PyBytes_GET_SIZE(s));
2857 Py_DECREF(s);
2858 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002859}
2860
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002861/* --- Unicode Internal Codec ------------------------------------------- */
2862
2863PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002864 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002865 const char *errors)
2866{
2867 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002868 Py_ssize_t startinpos;
2869 Py_ssize_t endinpos;
2870 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002871 PyUnicodeObject *v;
2872 Py_UNICODE *p;
2873 const char *end;
2874 const char *reason;
2875 PyObject *errorHandler = NULL;
2876 PyObject *exc = NULL;
2877
Neal Norwitzd43069c2006-01-08 01:12:10 +00002878#ifdef Py_UNICODE_WIDE
2879 Py_UNICODE unimax = PyUnicode_GetMax();
2880#endif
2881
Thomas Wouters89f507f2006-12-13 04:49:30 +00002882 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002883 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2884 if (v == NULL)
2885 goto onError;
2886 if (PyUnicode_GetSize((PyObject *)v) == 0)
2887 return (PyObject *)v;
2888 p = PyUnicode_AS_UNICODE(v);
2889 end = s + size;
2890
2891 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002892 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002893 /* We have to sanity check the raw data, otherwise doom looms for
2894 some malformed UCS-4 data. */
2895 if (
2896 #ifdef Py_UNICODE_WIDE
2897 *p > unimax || *p < 0 ||
2898 #endif
2899 end-s < Py_UNICODE_SIZE
2900 )
2901 {
2902 startinpos = s - starts;
2903 if (end-s < Py_UNICODE_SIZE) {
2904 endinpos = end-starts;
2905 reason = "truncated input";
2906 }
2907 else {
2908 endinpos = s - starts + Py_UNICODE_SIZE;
2909 reason = "illegal code point (> 0x10FFFF)";
2910 }
2911 outpos = p - PyUnicode_AS_UNICODE(v);
2912 if (unicode_decode_call_errorhandler(
2913 errors, &errorHandler,
2914 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002915 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002916 (PyObject **)&v, &outpos, &p)) {
2917 goto onError;
2918 }
2919 }
2920 else {
2921 p++;
2922 s += Py_UNICODE_SIZE;
2923 }
2924 }
2925
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002926 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002927 goto onError;
2928 Py_XDECREF(errorHandler);
2929 Py_XDECREF(exc);
2930 return (PyObject *)v;
2931
2932 onError:
2933 Py_XDECREF(v);
2934 Py_XDECREF(errorHandler);
2935 Py_XDECREF(exc);
2936 return NULL;
2937}
2938
Guido van Rossumd57fd912000-03-10 22:53:23 +00002939/* --- Latin-1 Codec ------------------------------------------------------ */
2940
2941PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002942 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002943 const char *errors)
2944{
2945 PyUnicodeObject *v;
2946 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002947
Guido van Rossumd57fd912000-03-10 22:53:23 +00002948 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002949 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002950 Py_UNICODE r = *(unsigned char*)s;
2951 return PyUnicode_FromUnicode(&r, 1);
2952 }
2953
Guido van Rossumd57fd912000-03-10 22:53:23 +00002954 v = _PyUnicode_New(size);
2955 if (v == NULL)
2956 goto onError;
2957 if (size == 0)
2958 return (PyObject *)v;
2959 p = PyUnicode_AS_UNICODE(v);
2960 while (size-- > 0)
2961 *p++ = (unsigned char)*s++;
2962 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002963
Guido van Rossumd57fd912000-03-10 22:53:23 +00002964 onError:
2965 Py_XDECREF(v);
2966 return NULL;
2967}
2968
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002969/* create or adjust a UnicodeEncodeError */
2970static void make_encode_exception(PyObject **exceptionObject,
2971 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002972 const Py_UNICODE *unicode, Py_ssize_t size,
2973 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002974 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002975{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002976 if (*exceptionObject == NULL) {
2977 *exceptionObject = PyUnicodeEncodeError_Create(
2978 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979 }
2980 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002981 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2982 goto onError;
2983 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2984 goto onError;
2985 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2986 goto onError;
2987 return;
2988 onError:
2989 Py_DECREF(*exceptionObject);
2990 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002991 }
2992}
2993
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002994/* raises a UnicodeEncodeError */
2995static void raise_encode_exception(PyObject **exceptionObject,
2996 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002997 const Py_UNICODE *unicode, Py_ssize_t size,
2998 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002999 const char *reason)
3000{
3001 make_encode_exception(exceptionObject,
3002 encoding, unicode, size, startpos, endpos, reason);
3003 if (*exceptionObject != NULL)
3004 PyCodec_StrictErrors(*exceptionObject);
3005}
3006
3007/* error handling callback helper:
3008 build arguments, call the callback and check the arguments,
3009 put the result into newpos and return the replacement string, which
3010 has to be freed by the caller */
3011static PyObject *unicode_encode_call_errorhandler(const char *errors,
3012 PyObject **errorHandler,
3013 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003014 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3015 Py_ssize_t startpos, Py_ssize_t endpos,
3016 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003017{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003018 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003019
3020 PyObject *restuple;
3021 PyObject *resunicode;
3022
3023 if (*errorHandler == NULL) {
3024 *errorHandler = PyCodec_LookupError(errors);
3025 if (*errorHandler == NULL)
3026 return NULL;
3027 }
3028
3029 make_encode_exception(exceptionObject,
3030 encoding, unicode, size, startpos, endpos, reason);
3031 if (*exceptionObject == NULL)
3032 return NULL;
3033
3034 restuple = PyObject_CallFunctionObjArgs(
3035 *errorHandler, *exceptionObject, NULL);
3036 if (restuple == NULL)
3037 return NULL;
3038 if (!PyTuple_Check(restuple)) {
3039 PyErr_Format(PyExc_TypeError, &argparse[4]);
3040 Py_DECREF(restuple);
3041 return NULL;
3042 }
3043 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3044 &resunicode, newpos)) {
3045 Py_DECREF(restuple);
3046 return NULL;
3047 }
3048 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003049 *newpos = size+*newpos;
3050 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003051 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003052 Py_DECREF(restuple);
3053 return NULL;
3054 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003055 Py_INCREF(resunicode);
3056 Py_DECREF(restuple);
3057 return resunicode;
3058}
3059
3060static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003061 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003062 const char *errors,
3063 int limit)
3064{
3065 /* output object */
3066 PyObject *res;
3067 /* pointers to the beginning and end+1 of input */
3068 const Py_UNICODE *startp = p;
3069 const Py_UNICODE *endp = p + size;
3070 /* pointer to the beginning of the unencodable characters */
3071 /* const Py_UNICODE *badp = NULL; */
3072 /* pointer into the output */
3073 char *str;
3074 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003075 Py_ssize_t respos = 0;
3076 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003077 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3078 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003079 PyObject *errorHandler = NULL;
3080 PyObject *exc = NULL;
3081 /* the following variable is used for caching string comparisons
3082 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3083 int known_errorHandler = -1;
3084
3085 /* allocate enough for a simple encoding without
3086 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003087 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003088 if (res == NULL)
3089 goto onError;
3090 if (size == 0)
3091 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003092 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003093 ressize = size;
3094
3095 while (p<endp) {
3096 Py_UNICODE c = *p;
3097
3098 /* can we encode this? */
3099 if (c<limit) {
3100 /* no overflow check, because we know that the space is enough */
3101 *str++ = (char)c;
3102 ++p;
3103 }
3104 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003105 Py_ssize_t unicodepos = p-startp;
3106 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003107 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003108 Py_ssize_t repsize;
3109 Py_ssize_t newpos;
3110 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003111 Py_UNICODE *uni2;
3112 /* startpos for collecting unencodable chars */
3113 const Py_UNICODE *collstart = p;
3114 const Py_UNICODE *collend = p;
3115 /* find all unecodable characters */
3116 while ((collend < endp) && ((*collend)>=limit))
3117 ++collend;
3118 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3119 if (known_errorHandler==-1) {
3120 if ((errors==NULL) || (!strcmp(errors, "strict")))
3121 known_errorHandler = 1;
3122 else if (!strcmp(errors, "replace"))
3123 known_errorHandler = 2;
3124 else if (!strcmp(errors, "ignore"))
3125 known_errorHandler = 3;
3126 else if (!strcmp(errors, "xmlcharrefreplace"))
3127 known_errorHandler = 4;
3128 else
3129 known_errorHandler = 0;
3130 }
3131 switch (known_errorHandler) {
3132 case 1: /* strict */
3133 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3134 goto onError;
3135 case 2: /* replace */
3136 while (collstart++<collend)
3137 *str++ = '?'; /* fall through */
3138 case 3: /* ignore */
3139 p = collend;
3140 break;
3141 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003142 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003143 /* determine replacement size (temporarily (mis)uses p) */
3144 for (p = collstart, repsize = 0; p < collend; ++p) {
3145 if (*p<10)
3146 repsize += 2+1+1;
3147 else if (*p<100)
3148 repsize += 2+2+1;
3149 else if (*p<1000)
3150 repsize += 2+3+1;
3151 else if (*p<10000)
3152 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003153#ifndef Py_UNICODE_WIDE
3154 else
3155 repsize += 2+5+1;
3156#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003157 else if (*p<100000)
3158 repsize += 2+5+1;
3159 else if (*p<1000000)
3160 repsize += 2+6+1;
3161 else
3162 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003163#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003164 }
3165 requiredsize = respos+repsize+(endp-collend);
3166 if (requiredsize > ressize) {
3167 if (requiredsize<2*ressize)
3168 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003169 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003170 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003171 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003172 ressize = requiredsize;
3173 }
3174 /* generate replacement (temporarily (mis)uses p) */
3175 for (p = collstart; p < collend; ++p) {
3176 str += sprintf(str, "&#%d;", (int)*p);
3177 }
3178 p = collend;
3179 break;
3180 default:
3181 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3182 encoding, reason, startp, size, &exc,
3183 collstart-startp, collend-startp, &newpos);
3184 if (repunicode == NULL)
3185 goto onError;
3186 /* need more space? (at least enough for what we
3187 have+the replacement+the rest of the string, so
3188 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003189 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003190 repsize = PyUnicode_GET_SIZE(repunicode);
3191 requiredsize = respos+repsize+(endp-collend);
3192 if (requiredsize > ressize) {
3193 if (requiredsize<2*ressize)
3194 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003195 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003196 Py_DECREF(repunicode);
3197 goto onError;
3198 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003199 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003200 ressize = requiredsize;
3201 }
3202 /* check if there is anything unencodable in the replacement
3203 and copy it to the output */
3204 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3205 c = *uni2;
3206 if (c >= limit) {
3207 raise_encode_exception(&exc, encoding, startp, size,
3208 unicodepos, unicodepos+1, reason);
3209 Py_DECREF(repunicode);
3210 goto onError;
3211 }
3212 *str = (char)c;
3213 }
3214 p = startp + newpos;
3215 Py_DECREF(repunicode);
3216 }
3217 }
3218 }
3219 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003220 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003221 if (respos<ressize)
3222 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003223 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003224 Py_XDECREF(errorHandler);
3225 Py_XDECREF(exc);
3226 return res;
3227
3228 onError:
3229 Py_XDECREF(res);
3230 Py_XDECREF(errorHandler);
3231 Py_XDECREF(exc);
3232 return NULL;
3233}
3234
Guido van Rossumd57fd912000-03-10 22:53:23 +00003235PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003236 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237 const char *errors)
3238{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003239 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003240}
3241
3242PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3243{
3244 if (!PyUnicode_Check(unicode)) {
3245 PyErr_BadArgument();
3246 return NULL;
3247 }
3248 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3249 PyUnicode_GET_SIZE(unicode),
3250 NULL);
3251}
3252
3253/* --- 7-bit ASCII Codec -------------------------------------------------- */
3254
Guido van Rossumd57fd912000-03-10 22:53:23 +00003255PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003256 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003257 const char *errors)
3258{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003259 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003260 PyUnicodeObject *v;
3261 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003262 Py_ssize_t startinpos;
3263 Py_ssize_t endinpos;
3264 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003265 const char *e;
3266 PyObject *errorHandler = NULL;
3267 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003268
Guido van Rossumd57fd912000-03-10 22:53:23 +00003269 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003270 if (size == 1 && *(unsigned char*)s < 128) {
3271 Py_UNICODE r = *(unsigned char*)s;
3272 return PyUnicode_FromUnicode(&r, 1);
3273 }
Tim Petersced69f82003-09-16 20:30:58 +00003274
Guido van Rossumd57fd912000-03-10 22:53:23 +00003275 v = _PyUnicode_New(size);
3276 if (v == NULL)
3277 goto onError;
3278 if (size == 0)
3279 return (PyObject *)v;
3280 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003281 e = s + size;
3282 while (s < e) {
3283 register unsigned char c = (unsigned char)*s;
3284 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003285 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003286 ++s;
3287 }
3288 else {
3289 startinpos = s-starts;
3290 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003291 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003292 if (unicode_decode_call_errorhandler(
3293 errors, &errorHandler,
3294 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003295 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003296 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003297 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003298 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003299 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003300 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003301 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003302 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003303 Py_XDECREF(errorHandler);
3304 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003305 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003306
Guido van Rossumd57fd912000-03-10 22:53:23 +00003307 onError:
3308 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003309 Py_XDECREF(errorHandler);
3310 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003311 return NULL;
3312}
3313
Guido van Rossumd57fd912000-03-10 22:53:23 +00003314PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003315 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003316 const char *errors)
3317{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003318 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003319}
3320
3321PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3322{
3323 if (!PyUnicode_Check(unicode)) {
3324 PyErr_BadArgument();
3325 return NULL;
3326 }
3327 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3328 PyUnicode_GET_SIZE(unicode),
3329 NULL);
3330}
3331
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003332#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003333
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003334/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003335
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003336#if SIZEOF_INT < SIZEOF_SSIZE_T
3337#define NEED_RETRY
3338#endif
3339
3340/* XXX This code is limited to "true" double-byte encodings, as
3341 a) it assumes an incomplete character consists of a single byte, and
3342 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3343 encodings, see IsDBCSLeadByteEx documentation. */
3344
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.  A byte that IsDBCSLeadByte() accepts is still
   rejected when the previous character (as found by CharPrev) starts
   with a lead byte and is not two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
3355
3356/*
3357 * Decode MBCS string into unicode object. If 'final' is set, converts
3358 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3359 */
3360static int decode_mbcs(PyUnicodeObject **v,
3361 const char *s, /* MBCS string */
3362 int size, /* sizeof MBCS string */
3363 int final)
3364{
3365 Py_UNICODE *p;
3366 Py_ssize_t n = 0;
3367 int usize = 0;
3368
3369 assert(size >= 0);
3370
3371 /* Skip trailing lead-byte unless 'final' is set */
3372 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3373 --size;
3374
3375 /* First get the size of the result */
3376 if (size > 0) {
3377 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3378 if (usize == 0) {
3379 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3380 return -1;
3381 }
3382 }
3383
3384 if (*v == NULL) {
3385 /* Create unicode object */
3386 *v = _PyUnicode_New(usize);
3387 if (*v == NULL)
3388 return -1;
3389 }
3390 else {
3391 /* Extend unicode object */
3392 n = PyUnicode_GET_SIZE(*v);
3393 if (_PyUnicode_Resize(v, n + usize) < 0)
3394 return -1;
3395 }
3396
3397 /* Do the conversion */
3398 if (size > 0) {
3399 p = PyUnicode_AS_UNICODE(*v) + n;
3400 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3401 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3402 return -1;
3403 }
3404 }
3405
3406 return size;
3407}
3408
3409PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3410 Py_ssize_t size,
3411 const char *errors,
3412 Py_ssize_t *consumed)
3413{
3414 PyUnicodeObject *v = NULL;
3415 int done;
3416
3417 if (consumed)
3418 *consumed = 0;
3419
3420#ifdef NEED_RETRY
3421 retry:
3422 if (size > INT_MAX)
3423 done = decode_mbcs(&v, s, INT_MAX, 0);
3424 else
3425#endif
3426 done = decode_mbcs(&v, s, (int)size, !consumed);
3427
3428 if (done < 0) {
3429 Py_XDECREF(v);
3430 return NULL;
3431 }
3432
3433 if (consumed)
3434 *consumed += done;
3435
3436#ifdef NEED_RETRY
3437 if (size > INT_MAX) {
3438 s += done;
3439 size -= done;
3440 goto retry;
3441 }
3442#endif
3443
3444 return (PyObject *)v;
3445}
3446
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003447PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003448 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003449 const char *errors)
3450{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003451 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3452}
3453
3454/*
3455 * Convert unicode into string object (MBCS).
3456 * Returns 0 if succeed, -1 otherwise.
3457 */
3458static int encode_mbcs(PyObject **repr,
3459 const Py_UNICODE *p, /* unicode */
3460 int size) /* size of unicode */
3461{
3462 int mbcssize = 0;
3463 Py_ssize_t n = 0;
3464
3465 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003466
3467 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003468 if (size > 0) {
3469 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3470 if (mbcssize == 0) {
3471 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3472 return -1;
3473 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003474 }
3475
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003476 if (*repr == NULL) {
3477 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003478 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003479 if (*repr == NULL)
3480 return -1;
3481 }
3482 else {
3483 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003484 n = PyBytes_Size(*repr);
3485 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003486 return -1;
3487 }
3488
3489 /* Do the conversion */
3490 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003491 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003492 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3493 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3494 return -1;
3495 }
3496 }
3497
3498 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003499}
3500
3501PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003502 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003503 const char *errors)
3504{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003505 PyObject *repr = NULL;
3506 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003507
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003508#ifdef NEED_RETRY
3509 retry:
3510 if (size > INT_MAX)
3511 ret = encode_mbcs(&repr, p, INT_MAX);
3512 else
3513#endif
3514 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003515
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003516 if (ret < 0) {
3517 Py_XDECREF(repr);
3518 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003519 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003520
3521#ifdef NEED_RETRY
3522 if (size > INT_MAX) {
3523 p += INT_MAX;
3524 size -= INT_MAX;
3525 goto retry;
3526 }
3527#endif
3528
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003529 return repr;
3530}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003531
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003532PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3533{
3534 if (!PyUnicode_Check(unicode)) {
3535 PyErr_BadArgument();
3536 return NULL;
3537 }
3538 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3539 PyUnicode_GET_SIZE(unicode),
3540 NULL);
3541}
3542
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003543#undef NEED_RETRY
3544
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003545#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003546
Guido van Rossumd57fd912000-03-10 22:53:23 +00003547/* --- Character Mapping Codec -------------------------------------------- */
3548
Guido van Rossumd57fd912000-03-10 22:53:23 +00003549PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003550 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003551 PyObject *mapping,
3552 const char *errors)
3553{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003554 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003555 Py_ssize_t startinpos;
3556 Py_ssize_t endinpos;
3557 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003558 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003559 PyUnicodeObject *v;
3560 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003561 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003562 PyObject *errorHandler = NULL;
3563 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003564 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003565 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003566
Guido van Rossumd57fd912000-03-10 22:53:23 +00003567 /* Default to Latin-1 */
3568 if (mapping == NULL)
3569 return PyUnicode_DecodeLatin1(s, size, errors);
3570
3571 v = _PyUnicode_New(size);
3572 if (v == NULL)
3573 goto onError;
3574 if (size == 0)
3575 return (PyObject *)v;
3576 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003577 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003578 if (PyUnicode_CheckExact(mapping)) {
3579 mapstring = PyUnicode_AS_UNICODE(mapping);
3580 maplen = PyUnicode_GET_SIZE(mapping);
3581 while (s < e) {
3582 unsigned char ch = *s;
3583 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003584
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003585 if (ch < maplen)
3586 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003587
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003588 if (x == 0xfffe) {
3589 /* undefined mapping */
3590 outpos = p-PyUnicode_AS_UNICODE(v);
3591 startinpos = s-starts;
3592 endinpos = startinpos+1;
3593 if (unicode_decode_call_errorhandler(
3594 errors, &errorHandler,
3595 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003596 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003597 (PyObject **)&v, &outpos, &p)) {
3598 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003599 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003600 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003601 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003602 *p++ = x;
3603 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003604 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003605 }
3606 else {
3607 while (s < e) {
3608 unsigned char ch = *s;
3609 PyObject *w, *x;
3610
3611 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3612 w = PyInt_FromLong((long)ch);
3613 if (w == NULL)
3614 goto onError;
3615 x = PyObject_GetItem(mapping, w);
3616 Py_DECREF(w);
3617 if (x == NULL) {
3618 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3619 /* No mapping found means: mapping is undefined. */
3620 PyErr_Clear();
3621 x = Py_None;
3622 Py_INCREF(x);
3623 } else
3624 goto onError;
3625 }
3626
3627 /* Apply mapping */
3628 if (PyInt_Check(x)) {
3629 long value = PyInt_AS_LONG(x);
3630 if (value < 0 || value > 65535) {
3631 PyErr_SetString(PyExc_TypeError,
3632 "character mapping must be in range(65536)");
3633 Py_DECREF(x);
3634 goto onError;
3635 }
3636 *p++ = (Py_UNICODE)value;
3637 }
3638 else if (x == Py_None) {
3639 /* undefined mapping */
3640 outpos = p-PyUnicode_AS_UNICODE(v);
3641 startinpos = s-starts;
3642 endinpos = startinpos+1;
3643 if (unicode_decode_call_errorhandler(
3644 errors, &errorHandler,
3645 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003646 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003647 (PyObject **)&v, &outpos, &p)) {
3648 Py_DECREF(x);
3649 goto onError;
3650 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003651 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003652 continue;
3653 }
3654 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003655 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003656
3657 if (targetsize == 1)
3658 /* 1-1 mapping */
3659 *p++ = *PyUnicode_AS_UNICODE(x);
3660
3661 else if (targetsize > 1) {
3662 /* 1-n mapping */
3663 if (targetsize > extrachars) {
3664 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003665 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3666 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003667 (targetsize << 2);
3668 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003669 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003670 if (_PyUnicode_Resize(&v,
3671 PyUnicode_GET_SIZE(v) + needed) < 0) {
3672 Py_DECREF(x);
3673 goto onError;
3674 }
3675 p = PyUnicode_AS_UNICODE(v) + oldpos;
3676 }
3677 Py_UNICODE_COPY(p,
3678 PyUnicode_AS_UNICODE(x),
3679 targetsize);
3680 p += targetsize;
3681 extrachars -= targetsize;
3682 }
3683 /* 1-0 mapping: skip the character */
3684 }
3685 else {
3686 /* wrong return value */
3687 PyErr_SetString(PyExc_TypeError,
3688 "character mapping must return integer, None or unicode");
3689 Py_DECREF(x);
3690 goto onError;
3691 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003692 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003693 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003694 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003695 }
3696 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003697 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003698 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003699 Py_XDECREF(errorHandler);
3700 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003701 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003702
Guido van Rossumd57fd912000-03-10 22:53:23 +00003703 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003704 Py_XDECREF(errorHandler);
3705 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003706 Py_XDECREF(v);
3707 return NULL;
3708}
3709
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003710/* Charmap encoding: the lookup table */
3711
3712struct encoding_map{
3713 PyObject_HEAD
3714 unsigned char level1[32];
3715 int count2, count3;
3716 unsigned char level23[1];
3717};
3718
3719static PyObject*
3720encoding_map_size(PyObject *obj, PyObject* args)
3721{
3722 struct encoding_map *map = (struct encoding_map*)obj;
3723 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3724 128*map->count3);
3725}
3726
3727static PyMethodDef encoding_map_methods[] = {
3728 {"size", encoding_map_size, METH_NOARGS,
3729 PyDoc_STR("Return the size (in bytes) of this object") },
3730 { 0 }
3731};
3732
3733static void
3734encoding_map_dealloc(PyObject* o)
3735{
3736 PyObject_FREE(o);
3737}
3738
3739static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003740 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003741 "EncodingMap", /*tp_name*/
3742 sizeof(struct encoding_map), /*tp_basicsize*/
3743 0, /*tp_itemsize*/
3744 /* methods */
3745 encoding_map_dealloc, /*tp_dealloc*/
3746 0, /*tp_print*/
3747 0, /*tp_getattr*/
3748 0, /*tp_setattr*/
3749 0, /*tp_compare*/
3750 0, /*tp_repr*/
3751 0, /*tp_as_number*/
3752 0, /*tp_as_sequence*/
3753 0, /*tp_as_mapping*/
3754 0, /*tp_hash*/
3755 0, /*tp_call*/
3756 0, /*tp_str*/
3757 0, /*tp_getattro*/
3758 0, /*tp_setattro*/
3759 0, /*tp_as_buffer*/
3760 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3761 0, /*tp_doc*/
3762 0, /*tp_traverse*/
3763 0, /*tp_clear*/
3764 0, /*tp_richcompare*/
3765 0, /*tp_weaklistoffset*/
3766 0, /*tp_iter*/
3767 0, /*tp_iternext*/
3768 encoding_map_methods, /*tp_methods*/
3769 0, /*tp_members*/
3770 0, /*tp_getset*/
3771 0, /*tp_base*/
3772 0, /*tp_dict*/
3773 0, /*tp_descr_get*/
3774 0, /*tp_descr_set*/
3775 0, /*tp_dictoffset*/
3776 0, /*tp_init*/
3777 0, /*tp_alloc*/
3778 0, /*tp_new*/
3779 0, /*tp_free*/
3780 0, /*tp_is_gc*/
3781};
3782
/* Build an encoding map (unicode code point -> byte value) from a decoding
   table.

   string must be a unicode object of exactly 256 characters, where
   string[i] is the character that byte i decodes to; 0xFFFE marks a byte
   that has no character assigned.  Anything else is rejected with
   PyErr_BadArgument().

   Returns a new reference, or NULL on error.  The result is either
   - an EncodingMapType object holding a compact three-level trie, or
   - a plain dict {code point: byte} when the table cannot be represented
     by the trie (byte 0 does not map to U+0000, some other byte maps to
     U+0000, a character lies outside the BMP, or too many trie blocks
     would be needed). */
PyObject*
PyUnicode_BuildEncodingMap(PyObject* string)
{
    Py_UNICODE *decode;
    PyObject *result;
    struct encoding_map *mresult;
    int i;
    int need_dict = 0;
    /* Scratch tables used to count how many level-2 and level-3 blocks
       the trie needs.  level1 is indexed by the top 5 bits of a BMP
       character, level2 by its top 9 bits; 0xFF means "not used yet". */
    unsigned char level1[32];
    unsigned char level2[512];
    unsigned char *mlevel1, *mlevel2, *mlevel3;
    int count2 = 0, count3 = 0;

    if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
        PyErr_BadArgument();
        return NULL;
    }
    decode = PyUnicode_AS_UNICODE(string);
    memset(level1, 0xFF, sizeof level1);
    memset(level2, 0xFF, sizeof level2);

    /* If there isn't a one-to-one mapping of NULL to \0,
       or if there are non-BMP characters, we need to use
       a mapping dictionary. */
    if (decode[0] != 0)
        need_dict = 1;
    for (i = 1; i < 256; i++) {
        int l1, l2;
        if (decode[i] == 0
#ifdef Py_UNICODE_WIDE
            || decode[i] > 0xFFFF
#endif
            ) {
            need_dict = 1;
            break;
        }
        if (decode[i] == 0xFFFE)
            /* unmapped character */
            continue;
        /* Assign block numbers in order of first appearance. */
        l1 = decode[i] >> 11;
        l2 = decode[i] >> 7;
        if (level1[l1] == 0xFF)
            level1[l1] = count2++;
        if (level2[l2] == 0xFF)
            level2[l2] = count3++;
    }

    /* Block numbers are stored in unsigned chars and 0xFF is the
       "unused" marker, so that many blocks cannot be represented. */
    if (count2 >= 0xFF || count3 >= 0xFF)
        need_dict = 1;

    if (need_dict) {
        /* Fallback: a dict with one entry per byte value (entries for
           bytes that decode to the same character overwrite each other). */
        PyObject *result = PyDict_New();
        PyObject *key, *value;
        if (!result)
            return NULL;
        for (i = 0; i < 256; i++) {
            key = value = NULL;
            key = PyInt_FromLong(decode[i]);
            value = PyInt_FromLong(i);
            if (!key || !value)
                goto failed1;
            if (PyDict_SetItem(result, key, value) == -1)
                goto failed1;
            Py_DECREF(key);
            Py_DECREF(value);
        }
        return result;
      failed1:
        Py_XDECREF(key);
        Py_XDECREF(value);
        Py_DECREF(result);
        return NULL;
    }

    /* Create a three-level trie */
    /* The variable-size tail holds count2 level-2 blocks of 16 bytes
       followed by count3 level-3 blocks of 128 bytes.  The "- 1"
       presumably accounts for a one-element trailing array in
       struct encoding_map (declared outside this section). */
    result = PyObject_MALLOC(sizeof(struct encoding_map) +
                             16*count2 + 128*count3 - 1);
    if (!result)
        return PyErr_NoMemory();
    PyObject_Init(result, &EncodingMapType);
    mresult = (struct encoding_map*)result;
    mresult->count2 = count2;
    mresult->count3 = count3;
    mlevel1 = mresult->level1;
    mlevel2 = mresult->level23;
    mlevel3 = mresult->level23 + 16*count2;
    memcpy(mlevel1, level1, 32);
    /* level 2 starts out "unused"; level 3 starts out as 0, which the
       lookup code treats as "no mapping" */
    memset(mlevel2, 0xFF, 16*count2);
    memset(mlevel3, 0, 128*count3);
    /* Re-number the level-3 blocks while filling the trie.  Characters
       are visited in the same order as in the counting pass above. */
    count3 = 0;
    for (i = 1; i < 256; i++) {
        int o1, o2, o3, i2, i3;
        if (decode[i] == 0xFFFE)
            /* unmapped character */
            continue;
        o1 = decode[i]>>11;             /* top 5 bits: level-1 index */
        o2 = (decode[i]>>7) & 0xF;      /* next 4 bits: slot in level-2 block */
        i2 = 16*mlevel1[o1] + o2;
        if (mlevel2[i2] == 0xFF)
            mlevel2[i2] = count3++;
        o3 = decode[i] & 0x7F;          /* low 7 bits: slot in level-3 block */
        i3 = 128*mlevel2[i2] + o3;
        mlevel3[i3] = i;
    }
    return result;
}
3889
3890static int
3891encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3892{
3893 struct encoding_map *map = (struct encoding_map*)mapping;
3894 int l1 = c>>11;
3895 int l2 = (c>>7) & 0xF;
3896 int l3 = c & 0x7F;
3897 int i;
3898
3899#ifdef Py_UNICODE_WIDE
3900 if (c > 0xFFFF) {
3901 return -1;
3902 }
3903#endif
3904 if (c == 0)
3905 return 0;
3906 /* level 1*/
3907 i = map->level1[l1];
3908 if (i == 0xFF) {
3909 return -1;
3910 }
3911 /* level 2*/
3912 i = map->level23[16*i+l2];
3913 if (i == 0xFF) {
3914 return -1;
3915 }
3916 /* level 3 */
3917 i = map->level23[16*map->count2 + 128*i + l3];
3918 if (i == 0) {
3919 return -1;
3920 }
3921 return i;
3922}
3923
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003924/* Lookup the character ch in the mapping. If the character
3925 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003926 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003927static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003928{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003929 PyObject *w = PyInt_FromLong((long)c);
3930 PyObject *x;
3931
3932 if (w == NULL)
3933 return NULL;
3934 x = PyObject_GetItem(mapping, w);
3935 Py_DECREF(w);
3936 if (x == NULL) {
3937 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3938 /* No mapping found means: mapping is undefined. */
3939 PyErr_Clear();
3940 x = Py_None;
3941 Py_INCREF(x);
3942 return x;
3943 } else
3944 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003945 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003946 else if (x == Py_None)
3947 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003948 else if (PyInt_Check(x)) {
3949 long value = PyInt_AS_LONG(x);
3950 if (value < 0 || value > 255) {
3951 PyErr_SetString(PyExc_TypeError,
3952 "character mapping must be in range(256)");
3953 Py_DECREF(x);
3954 return NULL;
3955 }
3956 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003957 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003958 else if (PyString_Check(x))
3959 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003960 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003961 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00003962 PyErr_Format(PyExc_TypeError,
3963 "character mapping must return integer, None or str8, not %.400s",
3964 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003965 Py_DECREF(x);
3966 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003967 }
3968}
3969
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003970static int
Walter Dörwald827b0552007-05-12 13:23:53 +00003971charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003972{
Walter Dörwald827b0552007-05-12 13:23:53 +00003973 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003974 /* exponentially overallocate to minimize reallocations */
3975 if (requiredsize < 2*outsize)
3976 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00003977 if (PyBytes_Resize(outobj, requiredsize)) {
3978 Py_DECREF(outobj);
3979 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003980 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003981 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003982}
3983
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred while encoding */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003987/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00003988 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003989 space is available. Return a new reference to the object that
3990 was put in the output buffer, or Py_None, if the mapping was undefined
3991 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003992 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003993static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003994charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00003995 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003996{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003997 PyObject *rep;
3998 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00003999 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004000
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004001 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004002 int res = encoding_map_lookup(c, mapping);
4003 Py_ssize_t requiredsize = *outpos+1;
4004 if (res == -1)
4005 return enc_FAILED;
4006 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004007 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004008 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00004009 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004010 outstart[(*outpos)++] = (char)res;
4011 return enc_SUCCESS;
4012 }
4013
4014 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004015 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004016 return enc_EXCEPTION;
4017 else if (rep==Py_None) {
4018 Py_DECREF(rep);
4019 return enc_FAILED;
4020 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004021 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004022 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004023 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004024 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004025 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004026 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004027 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004028 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004029 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4030 }
4031 else {
4032 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004033 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4034 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004035 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004036 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004037 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004038 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004039 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004040 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004041 memcpy(outstart + *outpos, repchars, repsize);
4042 *outpos += repsize;
4043 }
4044 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004045 Py_DECREF(rep);
4046 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004047}
4048
4049/* handle an error in PyUnicode_EncodeCharmap
4050 Return 0 on success, -1 on error */
4051static
4052int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004053 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004054 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004055 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004056 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004057{
4058 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004059 Py_ssize_t repsize;
4060 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004061 Py_UNICODE *uni2;
4062 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004063 Py_ssize_t collstartpos = *inpos;
4064 Py_ssize_t collendpos = *inpos+1;
4065 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004066 char *encoding = "charmap";
4067 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004068 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004069
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004070 /* find all unencodable characters */
4071 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004072 PyObject *rep;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004073 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004074 int res = encoding_map_lookup(p[collendpos], mapping);
4075 if (res != -1)
4076 break;
4077 ++collendpos;
4078 continue;
4079 }
4080
4081 rep = charmapencode_lookup(p[collendpos], mapping);
4082 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004083 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004084 else if (rep!=Py_None) {
4085 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004086 break;
4087 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004088 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004089 ++collendpos;
4090 }
4091 /* cache callback name lookup
4092 * (if not done yet, i.e. it's the first error) */
4093 if (*known_errorHandler==-1) {
4094 if ((errors==NULL) || (!strcmp(errors, "strict")))
4095 *known_errorHandler = 1;
4096 else if (!strcmp(errors, "replace"))
4097 *known_errorHandler = 2;
4098 else if (!strcmp(errors, "ignore"))
4099 *known_errorHandler = 3;
4100 else if (!strcmp(errors, "xmlcharrefreplace"))
4101 *known_errorHandler = 4;
4102 else
4103 *known_errorHandler = 0;
4104 }
4105 switch (*known_errorHandler) {
4106 case 1: /* strict */
4107 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4108 return -1;
4109 case 2: /* replace */
4110 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4111 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004112 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004113 return -1;
4114 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004115 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004116 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4117 return -1;
4118 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004119 }
4120 /* fall through */
4121 case 3: /* ignore */
4122 *inpos = collendpos;
4123 break;
4124 case 4: /* xmlcharrefreplace */
4125 /* generate replacement (temporarily (mis)uses p) */
4126 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4127 char buffer[2+29+1+1];
4128 char *cp;
4129 sprintf(buffer, "&#%d;", (int)p[collpos]);
4130 for (cp = buffer; *cp; ++cp) {
4131 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004132 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004133 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004134 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004135 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4136 return -1;
4137 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004138 }
4139 }
4140 *inpos = collendpos;
4141 break;
4142 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004143 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004144 encoding, reason, p, size, exceptionObject,
4145 collstartpos, collendpos, &newpos);
4146 if (repunicode == NULL)
4147 return -1;
4148 /* generate replacement */
4149 repsize = PyUnicode_GET_SIZE(repunicode);
4150 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4151 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004152 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004153 return -1;
4154 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004155 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004156 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004157 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4158 return -1;
4159 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004160 }
4161 *inpos = newpos;
4162 Py_DECREF(repunicode);
4163 }
4164 return 0;
4165}
4166
Guido van Rossumd57fd912000-03-10 22:53:23 +00004167PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004168 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004169 PyObject *mapping,
4170 const char *errors)
4171{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004172 /* output object */
4173 PyObject *res = NULL;
4174 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004175 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004176 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004177 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004178 PyObject *errorHandler = NULL;
4179 PyObject *exc = NULL;
4180 /* the following variable is used for caching string comparisons
4181 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4182 * 3=ignore, 4=xmlcharrefreplace */
4183 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004184
4185 /* Default to Latin-1 */
4186 if (mapping == NULL)
4187 return PyUnicode_EncodeLatin1(p, size, errors);
4188
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004189 /* allocate enough for a simple encoding without
4190 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004191 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004192 if (res == NULL)
4193 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004194 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004195 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004196
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004197 while (inpos<size) {
4198 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004199 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004200 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004201 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004202 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004203 if (charmap_encoding_error(p, size, &inpos, mapping,
4204 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004205 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004206 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004207 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004208 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004209 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004210 else
4211 /* done with this character => adjust input position */
4212 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004214
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004215 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004216 if (respos<PyBytes_GET_SIZE(res)) {
4217 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004218 goto onError;
4219 }
4220 Py_XDECREF(exc);
4221 Py_XDECREF(errorHandler);
4222 return res;
4223
4224 onError:
4225 Py_XDECREF(res);
4226 Py_XDECREF(exc);
4227 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004228 return NULL;
4229}
4230
4231PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4232 PyObject *mapping)
4233{
4234 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4235 PyErr_BadArgument();
4236 return NULL;
4237 }
4238 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4239 PyUnicode_GET_SIZE(unicode),
4240 mapping,
4241 NULL);
4242}
4243
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004244/* create or adjust a UnicodeTranslateError */
4245static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004246 const Py_UNICODE *unicode, Py_ssize_t size,
4247 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004248 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004249{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004250 if (*exceptionObject == NULL) {
4251 *exceptionObject = PyUnicodeTranslateError_Create(
4252 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004253 }
4254 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004255 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4256 goto onError;
4257 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4258 goto onError;
4259 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4260 goto onError;
4261 return;
4262 onError:
4263 Py_DECREF(*exceptionObject);
4264 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004265 }
4266}
4267
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004268/* raises a UnicodeTranslateError */
4269static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004270 const Py_UNICODE *unicode, Py_ssize_t size,
4271 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004272 const char *reason)
4273{
4274 make_translate_exception(exceptionObject,
4275 unicode, size, startpos, endpos, reason);
4276 if (*exceptionObject != NULL)
4277 PyCodec_StrictErrors(*exceptionObject);
4278}
4279
4280/* error handling callback helper:
4281 build arguments, call the callback and check the arguments,
4282 put the result into newpos and return the replacement string, which
4283 has to be freed by the caller */
4284static PyObject *unicode_translate_call_errorhandler(const char *errors,
4285 PyObject **errorHandler,
4286 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004287 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4288 Py_ssize_t startpos, Py_ssize_t endpos,
4289 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004290{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004291 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004292
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004293 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004294 PyObject *restuple;
4295 PyObject *resunicode;
4296
4297 if (*errorHandler == NULL) {
4298 *errorHandler = PyCodec_LookupError(errors);
4299 if (*errorHandler == NULL)
4300 return NULL;
4301 }
4302
4303 make_translate_exception(exceptionObject,
4304 unicode, size, startpos, endpos, reason);
4305 if (*exceptionObject == NULL)
4306 return NULL;
4307
4308 restuple = PyObject_CallFunctionObjArgs(
4309 *errorHandler, *exceptionObject, NULL);
4310 if (restuple == NULL)
4311 return NULL;
4312 if (!PyTuple_Check(restuple)) {
4313 PyErr_Format(PyExc_TypeError, &argparse[4]);
4314 Py_DECREF(restuple);
4315 return NULL;
4316 }
4317 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004318 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004319 Py_DECREF(restuple);
4320 return NULL;
4321 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004322 if (i_newpos<0)
4323 *newpos = size+i_newpos;
4324 else
4325 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004326 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004327 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004328 Py_DECREF(restuple);
4329 return NULL;
4330 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004331 Py_INCREF(resunicode);
4332 Py_DECREF(restuple);
4333 return resunicode;
4334}
4335
4336/* Lookup the character ch in the mapping and put the result in result,
4337 which must be decrefed by the caller.
4338 Return 0 on success, -1 on error */
4339static
4340int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4341{
4342 PyObject *w = PyInt_FromLong((long)c);
4343 PyObject *x;
4344
4345 if (w == NULL)
4346 return -1;
4347 x = PyObject_GetItem(mapping, w);
4348 Py_DECREF(w);
4349 if (x == NULL) {
4350 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4351 /* No mapping found means: use 1:1 mapping. */
4352 PyErr_Clear();
4353 *result = NULL;
4354 return 0;
4355 } else
4356 return -1;
4357 }
4358 else if (x == Py_None) {
4359 *result = x;
4360 return 0;
4361 }
4362 else if (PyInt_Check(x)) {
4363 long value = PyInt_AS_LONG(x);
4364 long max = PyUnicode_GetMax();
4365 if (value < 0 || value > max) {
4366 PyErr_Format(PyExc_TypeError,
4367 "character mapping must be in range(0x%lx)", max+1);
4368 Py_DECREF(x);
4369 return -1;
4370 }
4371 *result = x;
4372 return 0;
4373 }
4374 else if (PyUnicode_Check(x)) {
4375 *result = x;
4376 return 0;
4377 }
4378 else {
4379 /* wrong return value */
4380 PyErr_SetString(PyExc_TypeError,
4381 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004382 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004383 return -1;
4384 }
4385}
4386/* ensure that *outobj is at least requiredsize characters long,
4387if not reallocate and adjust various state variables.
4388Return 0 on success, -1 on error */
4389static
Walter Dörwald4894c302003-10-24 14:25:28 +00004390int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004391 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004392{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004393 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004394 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004395 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004396 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004397 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004398 if (requiredsize < 2 * oldsize)
4399 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004400 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004401 return -1;
4402 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004403 }
4404 return 0;
4405}
4406/* lookup the character, put the result in the output string and adjust
4407 various state variables. Return a new reference to the object that
4408 was put in the output buffer in *result, or Py_None, if the mapping was
4409 undefined (in which case no character was written).
4410 The called must decref result.
4411 Return 0 on success, -1 on error. */
4412static
Walter Dörwald4894c302003-10-24 14:25:28 +00004413int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004414 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004415 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004416{
Walter Dörwald4894c302003-10-24 14:25:28 +00004417 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004418 return -1;
4419 if (*res==NULL) {
4420 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004421 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004422 }
4423 else if (*res==Py_None)
4424 ;
4425 else if (PyInt_Check(*res)) {
4426 /* no overflow check, because we know that the space is enough */
4427 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4428 }
4429 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004430 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004431 if (repsize==1) {
4432 /* no overflow check, because we know that the space is enough */
4433 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4434 }
4435 else if (repsize!=0) {
4436 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004437 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004438 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004439 repsize - 1;
4440 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004441 return -1;
4442 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4443 *outp += repsize;
4444 }
4445 }
4446 else
4447 return -1;
4448 return 0;
4449}
4450
/* Translate the size characters starting at p through mapping and
   return the result as a new unicode object (NULL on error).

   For every character the mapping may yield: nothing (LookupError; the
   character is copied unchanged), an int code point, a unicode string
   of any length, or None.  None marks the character as untranslatable;
   runs of such characters are passed to the error handler named by
   errors (NULL means "strict").  mapping must not be NULL. */
PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
                                     Py_ssize_t size,
                                     PyObject *mapping,
                                     const char *errors)
{
    /* output object */
    PyObject *res = NULL;
    /* pointers to the beginning and end+1 of input */
    const Py_UNICODE *startp = p;
    const Py_UNICODE *endp = p + size;
    /* pointer into the output */
    Py_UNICODE *str;
    /* current output position */
    Py_ssize_t respos = 0;
    char *reason = "character maps to <undefined>";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
     * 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    /* allocate enough for a simple 1:1 translation without
       replacements, if we need more, we'll resize */
    res = PyUnicode_FromUnicode(NULL, size);
    if (res == NULL)
        goto onError;
    if (size == 0)
        return res;
    str = PyUnicode_AS_UNICODE(res);

    while (p<endp) {
        /* try to translate it */
        PyObject *x = NULL;
        if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
            Py_XDECREF(x);
            goto onError;
        }
        /* x is only compared by identity below, so it can be released
           right away */
        Py_XDECREF(x);
        if (x!=Py_None) /* it worked => adjust input pointer */
            ++p;
        else { /* untranslatable character */
            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
            Py_ssize_t repsize;
            Py_ssize_t newpos;
            Py_UNICODE *uni2;
            /* startpos for collecting untranslatable chars */
            const Py_UNICODE *collstart = p;
            const Py_UNICODE *collend = p+1;
            const Py_UNICODE *coll;

            /* find all untranslatable characters */
            while (collend < endp) {
                if (charmaptranslate_lookup(*collend, mapping, &x))
                    goto onError;
                Py_XDECREF(x);
                if (x!=Py_None)
                    break;
                ++collend;
            }
            /* cache callback name lookup
             * (if not done yet, i.e. it's the first error) */
            if (known_errorHandler==-1) {
                if ((errors==NULL) || (!strcmp(errors, "strict")))
                    known_errorHandler = 1;
                else if (!strcmp(errors, "replace"))
                    known_errorHandler = 2;
                else if (!strcmp(errors, "ignore"))
                    known_errorHandler = 3;
                else if (!strcmp(errors, "xmlcharrefreplace"))
                    known_errorHandler = 4;
                else
                    known_errorHandler = 0;
            }
            switch (known_errorHandler) {
                case 1: /* strict */
                    raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
                    goto onError;
                case 2: /* replace */
                    /* No need to check for space, this is a 1:1 replacement */
                    for (coll = collstart; coll<collend; ++coll)
                        *str++ = '?';
                    /* fall through */
                case 3: /* ignore */
                    p = collend;
                    break;
                case 4: /* xmlcharrefreplace */
                    /* generate replacement (temporarily (mis)uses p) */
                    for (p = collstart; p < collend; ++p) {
                        char buffer[2+29+1+1];
                        char *cp;
                        sprintf(buffer, "&#%d;", (int)*p);
                        /* room for what is written so far, this character
                           reference, and the input after the collected run */
                        if (charmaptranslate_makespace(&res, &str,
                            (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
                            goto onError;
                        for (cp = buffer; *cp; ++cp)
                            *str++ = *cp;
                    }
                    p = collend;
                    break;
                default:
                    /* user-registered error handler */
                    repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
                        reason, startp, size, &exc,
                        collstart-startp, collend-startp, &newpos);
                    if (repunicode == NULL)
                        goto onError;
                    /* generate replacement */
                    repsize = PyUnicode_GET_SIZE(repunicode);
                    /* NOTE(review): space is reserved for the input after
                       collend, but the handler may return a newpos before
                       collend; confirm the unchecked 1:1 writes cannot
                       overrun the buffer in that case. */
                    if (charmaptranslate_makespace(&res, &str,
                        (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
                        Py_DECREF(repunicode);
                        goto onError;
                    }
                    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
                        *str++ = *uni2;
                    /* continue where the handler told us to */
                    p = startp + newpos;
                    Py_DECREF(repunicode);
            }
        }
    }
    /* Resize if we allocated too much */
    respos = str-PyUnicode_AS_UNICODE(res);
    if (respos<PyUnicode_GET_SIZE(res)) {
        if (_PyUnicode_Resize(&res, respos) < 0)
            goto onError;
    }
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return res;

 onError:
    Py_XDECREF(res);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
4592
4593PyObject *PyUnicode_Translate(PyObject *str,
4594 PyObject *mapping,
4595 const char *errors)
4596{
4597 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004598
Guido van Rossumd57fd912000-03-10 22:53:23 +00004599 str = PyUnicode_FromObject(str);
4600 if (str == NULL)
4601 goto onError;
4602 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4603 PyUnicode_GET_SIZE(str),
4604 mapping,
4605 errors);
4606 Py_DECREF(str);
4607 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004608
Guido van Rossumd57fd912000-03-10 22:53:23 +00004609 onError:
4610 Py_XDECREF(str);
4611 return NULL;
4612}
Tim Petersced69f82003-09-16 20:30:58 +00004613
Guido van Rossum9e896b32000-04-05 20:11:21 +00004614/* --- Decimal Encoder ---------------------------------------------------- */
4615
4616int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004617 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004618 char *output,
4619 const char *errors)
4620{
4621 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004622 PyObject *errorHandler = NULL;
4623 PyObject *exc = NULL;
4624 const char *encoding = "decimal";
4625 const char *reason = "invalid decimal Unicode string";
4626 /* the following variable is used for caching string comparisons
4627 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4628 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004629
4630 if (output == NULL) {
4631 PyErr_BadArgument();
4632 return -1;
4633 }
4634
4635 p = s;
4636 end = s + length;
4637 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004638 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004639 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004640 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004641 Py_ssize_t repsize;
4642 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004643 Py_UNICODE *uni2;
4644 Py_UNICODE *collstart;
4645 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004646
Guido van Rossum9e896b32000-04-05 20:11:21 +00004647 if (Py_UNICODE_ISSPACE(ch)) {
4648 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004649 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004650 continue;
4651 }
4652 decimal = Py_UNICODE_TODECIMAL(ch);
4653 if (decimal >= 0) {
4654 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004655 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004656 continue;
4657 }
Guido van Rossumba477042000-04-06 18:18:10 +00004658 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004659 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004660 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004661 continue;
4662 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004663 /* All other characters are considered unencodable */
4664 collstart = p;
4665 collend = p+1;
4666 while (collend < end) {
4667 if ((0 < *collend && *collend < 256) ||
4668 !Py_UNICODE_ISSPACE(*collend) ||
4669 Py_UNICODE_TODECIMAL(*collend))
4670 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004671 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004672 /* cache callback name lookup
4673 * (if not done yet, i.e. it's the first error) */
4674 if (known_errorHandler==-1) {
4675 if ((errors==NULL) || (!strcmp(errors, "strict")))
4676 known_errorHandler = 1;
4677 else if (!strcmp(errors, "replace"))
4678 known_errorHandler = 2;
4679 else if (!strcmp(errors, "ignore"))
4680 known_errorHandler = 3;
4681 else if (!strcmp(errors, "xmlcharrefreplace"))
4682 known_errorHandler = 4;
4683 else
4684 known_errorHandler = 0;
4685 }
4686 switch (known_errorHandler) {
4687 case 1: /* strict */
4688 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4689 goto onError;
4690 case 2: /* replace */
4691 for (p = collstart; p < collend; ++p)
4692 *output++ = '?';
4693 /* fall through */
4694 case 3: /* ignore */
4695 p = collend;
4696 break;
4697 case 4: /* xmlcharrefreplace */
4698 /* generate replacement (temporarily (mis)uses p) */
4699 for (p = collstart; p < collend; ++p)
4700 output += sprintf(output, "&#%d;", (int)*p);
4701 p = collend;
4702 break;
4703 default:
4704 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4705 encoding, reason, s, length, &exc,
4706 collstart-s, collend-s, &newpos);
4707 if (repunicode == NULL)
4708 goto onError;
4709 /* generate replacement */
4710 repsize = PyUnicode_GET_SIZE(repunicode);
4711 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4712 Py_UNICODE ch = *uni2;
4713 if (Py_UNICODE_ISSPACE(ch))
4714 *output++ = ' ';
4715 else {
4716 decimal = Py_UNICODE_TODECIMAL(ch);
4717 if (decimal >= 0)
4718 *output++ = '0' + decimal;
4719 else if (0 < ch && ch < 256)
4720 *output++ = (char)ch;
4721 else {
4722 Py_DECREF(repunicode);
4723 raise_encode_exception(&exc, encoding,
4724 s, length, collstart-s, collend-s, reason);
4725 goto onError;
4726 }
4727 }
4728 }
4729 p = s + newpos;
4730 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004731 }
4732 }
4733 /* 0-terminate the output string */
4734 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004735 Py_XDECREF(exc);
4736 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004737 return 0;
4738
4739 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004740 Py_XDECREF(exc);
4741 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004742 return -1;
4743}
4744
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745/* --- Helpers ------------------------------------------------------------ */
4746
/* Bind the generic STRINGLIB_* names to the Py_UNICODE type and the
   PyUnicode accessors.  NOTE(review): presumably these are consumed by the
   stringlib headers included further down -- confirm against
   those headers. */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004747#define STRINGLIB_CHAR Py_UNICODE
4748
4749#define STRINGLIB_LEN PyUnicode_GET_SIZE
4750#define STRINGLIB_NEW PyUnicode_FromUnicode
4751#define STRINGLIB_STR PyUnicode_AS_UNICODE
4752
4753Py_LOCAL_INLINE(int)
4754STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004756 if (str[0] != other[0])
4757 return 1;
4758 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759}
4760
Thomas Wouters477c8d52006-05-27 19:21:47 +00004761#define STRINGLIB_EMPTY unicode_empty
4762
4763#include "stringlib/fastsearch.h"
4764
4765#include "stringlib/count.h"
4766#include "stringlib/find.h"
4767#include "stringlib/partition.h"
4768
/* Helper macro normalising slice bounds against (obj)->length.

   It operates on variables named `start` and `end` that must exist in
   the scope where the macro is expanded:
     - a negative `start` is taken relative to the end, then clamped to 0;
     - `end` is first clamped to the length; a negative `end` is taken
       relative to the end, then clamped to 0.

   The expansion is a plain sequence of statements (not wrapped in a
   do/while), so it must only be used where a statement list is valid. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
4781
Martin v. Löwis18e16552006-02-15 17:27:45 +00004782Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004783 PyObject *substr,
4784 Py_ssize_t start,
4785 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004786{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004787 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004788 PyUnicodeObject* str_obj;
4789 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004790
Thomas Wouters477c8d52006-05-27 19:21:47 +00004791 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4792 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004794 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4795 if (!sub_obj) {
4796 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797 return -1;
4798 }
Tim Petersced69f82003-09-16 20:30:58 +00004799
Thomas Wouters477c8d52006-05-27 19:21:47 +00004800 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004801
Thomas Wouters477c8d52006-05-27 19:21:47 +00004802 result = stringlib_count(
4803 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4804 );
4805
4806 Py_DECREF(sub_obj);
4807 Py_DECREF(str_obj);
4808
Guido van Rossumd57fd912000-03-10 22:53:23 +00004809 return result;
4810}
4811
Martin v. Löwis18e16552006-02-15 17:27:45 +00004812Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004813 PyObject *sub,
4814 Py_ssize_t start,
4815 Py_ssize_t end,
4816 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004818 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004819
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004821 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004822 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004823 sub = PyUnicode_FromObject(sub);
4824 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004825 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004826 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004827 }
Tim Petersced69f82003-09-16 20:30:58 +00004828
Thomas Wouters477c8d52006-05-27 19:21:47 +00004829 if (direction > 0)
4830 result = stringlib_find_slice(
4831 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4832 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4833 start, end
4834 );
4835 else
4836 result = stringlib_rfind_slice(
4837 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4838 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4839 start, end
4840 );
4841
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004843 Py_DECREF(sub);
4844
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845 return result;
4846}
4847
Tim Petersced69f82003-09-16 20:30:58 +00004848static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849int tailmatch(PyUnicodeObject *self,
4850 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004851 Py_ssize_t start,
4852 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004853 int direction)
4854{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855 if (substring->length == 0)
4856 return 1;
4857
Thomas Wouters477c8d52006-05-27 19:21:47 +00004858 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859
4860 end -= substring->length;
4861 if (end < start)
4862 return 0;
4863
4864 if (direction > 0) {
4865 if (Py_UNICODE_MATCH(self, end, substring))
4866 return 1;
4867 } else {
4868 if (Py_UNICODE_MATCH(self, start, substring))
4869 return 1;
4870 }
4871
4872 return 0;
4873}
4874
Martin v. Löwis18e16552006-02-15 17:27:45 +00004875Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004877 Py_ssize_t start,
4878 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879 int direction)
4880{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004881 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004882
Guido van Rossumd57fd912000-03-10 22:53:23 +00004883 str = PyUnicode_FromObject(str);
4884 if (str == NULL)
4885 return -1;
4886 substr = PyUnicode_FromObject(substr);
4887 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004888 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004889 return -1;
4890 }
Tim Petersced69f82003-09-16 20:30:58 +00004891
Guido van Rossumd57fd912000-03-10 22:53:23 +00004892 result = tailmatch((PyUnicodeObject *)str,
4893 (PyUnicodeObject *)substr,
4894 start, end, direction);
4895 Py_DECREF(str);
4896 Py_DECREF(substr);
4897 return result;
4898}
4899
Guido van Rossumd57fd912000-03-10 22:53:23 +00004900/* Apply fixfct filter to the Unicode object self and return a
4901 reference to the modified object */
4902
Tim Petersced69f82003-09-16 20:30:58 +00004903static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004904PyObject *fixup(PyUnicodeObject *self,
4905 int (*fixfct)(PyUnicodeObject *s))
4906{
4907
4908 PyUnicodeObject *u;
4909
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004910 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004911 if (u == NULL)
4912 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004913
4914 Py_UNICODE_COPY(u->str, self->str, self->length);
4915
Tim Peters7a29bd52001-09-12 03:03:31 +00004916 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004917 /* fixfct should return TRUE if it modified the buffer. If
4918 FALSE, return a reference to the original buffer instead
4919 (to save space, not time) */
4920 Py_INCREF(self);
4921 Py_DECREF(u);
4922 return (PyObject*) self;
4923 }
4924 return (PyObject*) u;
4925}
4926
Tim Petersced69f82003-09-16 20:30:58 +00004927static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004928int fixupper(PyUnicodeObject *self)
4929{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004930 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004931 Py_UNICODE *s = self->str;
4932 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004933
Guido van Rossumd57fd912000-03-10 22:53:23 +00004934 while (len-- > 0) {
4935 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004936
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937 ch = Py_UNICODE_TOUPPER(*s);
4938 if (ch != *s) {
4939 status = 1;
4940 *s = ch;
4941 }
4942 s++;
4943 }
4944
4945 return status;
4946}
4947
Tim Petersced69f82003-09-16 20:30:58 +00004948static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004949int fixlower(PyUnicodeObject *self)
4950{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004951 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004952 Py_UNICODE *s = self->str;
4953 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004954
Guido van Rossumd57fd912000-03-10 22:53:23 +00004955 while (len-- > 0) {
4956 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004957
Guido van Rossumd57fd912000-03-10 22:53:23 +00004958 ch = Py_UNICODE_TOLOWER(*s);
4959 if (ch != *s) {
4960 status = 1;
4961 *s = ch;
4962 }
4963 s++;
4964 }
4965
4966 return status;
4967}
4968
Tim Petersced69f82003-09-16 20:30:58 +00004969static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004970int fixswapcase(PyUnicodeObject *self)
4971{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004972 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004973 Py_UNICODE *s = self->str;
4974 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004975
Guido van Rossumd57fd912000-03-10 22:53:23 +00004976 while (len-- > 0) {
4977 if (Py_UNICODE_ISUPPER(*s)) {
4978 *s = Py_UNICODE_TOLOWER(*s);
4979 status = 1;
4980 } else if (Py_UNICODE_ISLOWER(*s)) {
4981 *s = Py_UNICODE_TOUPPER(*s);
4982 status = 1;
4983 }
4984 s++;
4985 }
4986
4987 return status;
4988}
4989
Tim Petersced69f82003-09-16 20:30:58 +00004990static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004991int fixcapitalize(PyUnicodeObject *self)
4992{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004993 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004994 Py_UNICODE *s = self->str;
4995 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004996
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004997 if (len == 0)
4998 return 0;
4999 if (Py_UNICODE_ISLOWER(*s)) {
5000 *s = Py_UNICODE_TOUPPER(*s);
5001 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005002 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005003 s++;
5004 while (--len > 0) {
5005 if (Py_UNICODE_ISUPPER(*s)) {
5006 *s = Py_UNICODE_TOLOWER(*s);
5007 status = 1;
5008 }
5009 s++;
5010 }
5011 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005012}
5013
5014static
5015int fixtitle(PyUnicodeObject *self)
5016{
5017 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5018 register Py_UNICODE *e;
5019 int previous_is_cased;
5020
5021 /* Shortcut for single character strings */
5022 if (PyUnicode_GET_SIZE(self) == 1) {
5023 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5024 if (*p != ch) {
5025 *p = ch;
5026 return 1;
5027 }
5028 else
5029 return 0;
5030 }
Tim Petersced69f82003-09-16 20:30:58 +00005031
Guido van Rossumd57fd912000-03-10 22:53:23 +00005032 e = p + PyUnicode_GET_SIZE(self);
5033 previous_is_cased = 0;
5034 for (; p < e; p++) {
5035 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005036
Guido van Rossumd57fd912000-03-10 22:53:23 +00005037 if (previous_is_cased)
5038 *p = Py_UNICODE_TOLOWER(ch);
5039 else
5040 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005041
5042 if (Py_UNICODE_ISLOWER(ch) ||
5043 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005044 Py_UNICODE_ISTITLE(ch))
5045 previous_is_cased = 1;
5046 else
5047 previous_is_cased = 0;
5048 }
5049 return 1;
5050}
5051
Tim Peters8ce9f162004-08-27 01:49:32 +00005052PyObject *
5053PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005054{
Tim Peters8ce9f162004-08-27 01:49:32 +00005055 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005056 const Py_UNICODE blank = ' ';
5057 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005058 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005059 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005060 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5061 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005062 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5063 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005064 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005065 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005066 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005067
Tim Peters05eba1f2004-08-27 21:32:02 +00005068 fseq = PySequence_Fast(seq, "");
5069 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005070 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005071 }
5072
Tim Peters91879ab2004-08-27 22:35:44 +00005073 /* Grrrr. A codec may be invoked to convert str objects to
5074 * Unicode, and so it's possible to call back into Python code
5075 * during PyUnicode_FromObject(), and so it's possible for a sick
5076 * codec to change the size of fseq (if seq is a list). Therefore
5077 * we have to keep refetching the size -- can't assume seqlen
5078 * is invariant.
5079 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005080 seqlen = PySequence_Fast_GET_SIZE(fseq);
5081 /* If empty sequence, return u"". */
5082 if (seqlen == 0) {
5083 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5084 goto Done;
5085 }
5086 /* If singleton sequence with an exact Unicode, return that. */
5087 if (seqlen == 1) {
5088 item = PySequence_Fast_GET_ITEM(fseq, 0);
5089 if (PyUnicode_CheckExact(item)) {
5090 Py_INCREF(item);
5091 res = (PyUnicodeObject *)item;
5092 goto Done;
5093 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005094 }
5095
Tim Peters05eba1f2004-08-27 21:32:02 +00005096 /* At least two items to join, or one that isn't exact Unicode. */
5097 if (seqlen > 1) {
5098 /* Set up sep and seplen -- they're needed. */
5099 if (separator == NULL) {
5100 sep = &blank;
5101 seplen = 1;
5102 }
5103 else {
5104 internal_separator = PyUnicode_FromObject(separator);
5105 if (internal_separator == NULL)
5106 goto onError;
5107 sep = PyUnicode_AS_UNICODE(internal_separator);
5108 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005109 /* In case PyUnicode_FromObject() mutated seq. */
5110 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005111 }
5112 }
5113
5114 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005115 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005116 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005117 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005118 res_p = PyUnicode_AS_UNICODE(res);
5119 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005120
Tim Peters05eba1f2004-08-27 21:32:02 +00005121 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005122 Py_ssize_t itemlen;
5123 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005124
5125 item = PySequence_Fast_GET_ITEM(fseq, i);
5126 /* Convert item to Unicode. */
5127 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5128 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005129 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005130 " %.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00005131 i, Py_Type(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005132 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005133 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005134 item = PyUnicode_FromObject(item);
5135 if (item == NULL)
5136 goto onError;
5137 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005138
Tim Peters91879ab2004-08-27 22:35:44 +00005139 /* In case PyUnicode_FromObject() mutated seq. */
5140 seqlen = PySequence_Fast_GET_SIZE(fseq);
5141
Tim Peters8ce9f162004-08-27 01:49:32 +00005142 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005143 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005144 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005145 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005146 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005147 if (i < seqlen - 1) {
5148 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005149 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005150 goto Overflow;
5151 }
5152 if (new_res_used > res_alloc) {
5153 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005154 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005155 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005156 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005157 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005158 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005159 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005160 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005162 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005163 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005165
5166 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005167 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005168 res_p += itemlen;
5169 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005170 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005171 res_p += seplen;
5172 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005173 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005174 res_used = new_res_used;
5175 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005176
Tim Peters05eba1f2004-08-27 21:32:02 +00005177 /* Shrink res to match the used area; this probably can't fail,
5178 * but it's cheap to check.
5179 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005180 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005181 goto onError;
5182
5183 Done:
5184 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005185 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186 return (PyObject *)res;
5187
Tim Peters8ce9f162004-08-27 01:49:32 +00005188 Overflow:
5189 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005190 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005191 Py_DECREF(item);
5192 /* fall through */
5193
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005195 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005196 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005197 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198 return NULL;
5199}
5200
Tim Petersced69f82003-09-16 20:30:58 +00005201static
5202PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005203 Py_ssize_t left,
5204 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205 Py_UNICODE fill)
5206{
5207 PyUnicodeObject *u;
5208
5209 if (left < 0)
5210 left = 0;
5211 if (right < 0)
5212 right = 0;
5213
Tim Peters7a29bd52001-09-12 03:03:31 +00005214 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215 Py_INCREF(self);
5216 return self;
5217 }
5218
5219 u = _PyUnicode_New(left + self->length + right);
5220 if (u) {
5221 if (left)
5222 Py_UNICODE_FILL(u->str, fill, left);
5223 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5224 if (right)
5225 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5226 }
5227
5228 return u;
5229}
5230
/* Append data[left:right] as a new unicode object to `list`.

   The expansion uses the names `str` (PyObject * scratch variable),
   `list` and the label `onError` from the enclosing function and jumps
   to `onError` if either the allocation or the append fails.  It is a
   plain statement sequence, so use it only where a statement list is
   valid. */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (str == NULL)                                                    \
        goto onError;                                                   \
    if (PyList_Append(list, str) != 0) {                                \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    Py_DECREF(str);
5241
5242static
5243PyObject *split_whitespace(PyUnicodeObject *self,
5244 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005245 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005246{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005247 register Py_ssize_t i;
5248 register Py_ssize_t j;
5249 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005250 PyObject *str;
5251
5252 for (i = j = 0; i < len; ) {
5253 /* find a token */
5254 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5255 i++;
5256 j = i;
5257 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5258 i++;
5259 if (j < i) {
5260 if (maxcount-- <= 0)
5261 break;
5262 SPLIT_APPEND(self->str, j, i);
5263 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5264 i++;
5265 j = i;
5266 }
5267 }
5268 if (j < len) {
5269 SPLIT_APPEND(self->str, j, len);
5270 }
5271 return list;
5272
5273 onError:
5274 Py_DECREF(list);
5275 return NULL;
5276}
5277
5278PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005279 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005281 register Py_ssize_t i;
5282 register Py_ssize_t j;
5283 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005284 PyObject *list;
5285 PyObject *str;
5286 Py_UNICODE *data;
5287
5288 string = PyUnicode_FromObject(string);
5289 if (string == NULL)
5290 return NULL;
5291 data = PyUnicode_AS_UNICODE(string);
5292 len = PyUnicode_GET_SIZE(string);
5293
Guido van Rossumd57fd912000-03-10 22:53:23 +00005294 list = PyList_New(0);
5295 if (!list)
5296 goto onError;
5297
5298 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005299 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005300
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005302 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304
5305 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005306 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307 if (i < len) {
5308 if (data[i] == '\r' && i + 1 < len &&
5309 data[i+1] == '\n')
5310 i += 2;
5311 else
5312 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005313 if (keepends)
5314 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005315 }
Guido van Rossum86662912000-04-11 15:38:46 +00005316 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317 j = i;
5318 }
5319 if (j < len) {
5320 SPLIT_APPEND(data, j, len);
5321 }
5322
5323 Py_DECREF(string);
5324 return list;
5325
5326 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005327 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005328 Py_DECREF(string);
5329 return NULL;
5330}
5331
Tim Petersced69f82003-09-16 20:30:58 +00005332static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005333PyObject *split_char(PyUnicodeObject *self,
5334 PyObject *list,
5335 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005336 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005338 register Py_ssize_t i;
5339 register Py_ssize_t j;
5340 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005341 PyObject *str;
5342
5343 for (i = j = 0; i < len; ) {
5344 if (self->str[i] == ch) {
5345 if (maxcount-- <= 0)
5346 break;
5347 SPLIT_APPEND(self->str, j, i);
5348 i = j = i + 1;
5349 } else
5350 i++;
5351 }
5352 if (j <= len) {
5353 SPLIT_APPEND(self->str, j, len);
5354 }
5355 return list;
5356
5357 onError:
5358 Py_DECREF(list);
5359 return NULL;
5360}
5361
Tim Petersced69f82003-09-16 20:30:58 +00005362static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363PyObject *split_substring(PyUnicodeObject *self,
5364 PyObject *list,
5365 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005366 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005367{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005368 register Py_ssize_t i;
5369 register Py_ssize_t j;
5370 Py_ssize_t len = self->length;
5371 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372 PyObject *str;
5373
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005374 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375 if (Py_UNICODE_MATCH(self, i, substring)) {
5376 if (maxcount-- <= 0)
5377 break;
5378 SPLIT_APPEND(self->str, j, i);
5379 i = j = i + sublen;
5380 } else
5381 i++;
5382 }
5383 if (j <= len) {
5384 SPLIT_APPEND(self->str, j, len);
5385 }
5386 return list;
5387
5388 onError:
5389 Py_DECREF(list);
5390 return NULL;
5391}
5392
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005393static
5394PyObject *rsplit_whitespace(PyUnicodeObject *self,
5395 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005396 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005397{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005398 register Py_ssize_t i;
5399 register Py_ssize_t j;
5400 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005401 PyObject *str;
5402
5403 for (i = j = len - 1; i >= 0; ) {
5404 /* find a token */
5405 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5406 i--;
5407 j = i;
5408 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5409 i--;
5410 if (j > i) {
5411 if (maxcount-- <= 0)
5412 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005413 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005414 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5415 i--;
5416 j = i;
5417 }
5418 }
5419 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005420 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005421 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005422 if (PyList_Reverse(list) < 0)
5423 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005424 return list;
5425
5426 onError:
5427 Py_DECREF(list);
5428 return NULL;
5429}
5430
5431static
5432PyObject *rsplit_char(PyUnicodeObject *self,
5433 PyObject *list,
5434 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005435 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005436{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005437 register Py_ssize_t i;
5438 register Py_ssize_t j;
5439 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005440 PyObject *str;
5441
5442 for (i = j = len - 1; i >= 0; ) {
5443 if (self->str[i] == ch) {
5444 if (maxcount-- <= 0)
5445 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005446 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005447 j = i = i - 1;
5448 } else
5449 i--;
5450 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005451 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005452 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005453 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005454 if (PyList_Reverse(list) < 0)
5455 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005456 return list;
5457
5458 onError:
5459 Py_DECREF(list);
5460 return NULL;
5461}
5462
5463static
5464PyObject *rsplit_substring(PyUnicodeObject *self,
5465 PyObject *list,
5466 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005467 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005468{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005469 register Py_ssize_t i;
5470 register Py_ssize_t j;
5471 Py_ssize_t len = self->length;
5472 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005473 PyObject *str;
5474
5475 for (i = len - sublen, j = len; i >= 0; ) {
5476 if (Py_UNICODE_MATCH(self, i, substring)) {
5477 if (maxcount-- <= 0)
5478 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005479 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005480 j = i;
5481 i -= sublen;
5482 } else
5483 i--;
5484 }
5485 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005486 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005487 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005488 if (PyList_Reverse(list) < 0)
5489 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005490 return list;
5491
5492 onError:
5493 Py_DECREF(list);
5494 return NULL;
5495}
5496
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497#undef SPLIT_APPEND
5498
5499static
5500PyObject *split(PyUnicodeObject *self,
5501 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005502 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503{
5504 PyObject *list;
5505
5506 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005507 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005508
5509 list = PyList_New(0);
5510 if (!list)
5511 return NULL;
5512
5513 if (substring == NULL)
5514 return split_whitespace(self,list,maxcount);
5515
5516 else if (substring->length == 1)
5517 return split_char(self,list,substring->str[0],maxcount);
5518
5519 else if (substring->length == 0) {
5520 Py_DECREF(list);
5521 PyErr_SetString(PyExc_ValueError, "empty separator");
5522 return NULL;
5523 }
5524 else
5525 return split_substring(self,list,substring,maxcount);
5526}
5527
Tim Petersced69f82003-09-16 20:30:58 +00005528static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005529PyObject *rsplit(PyUnicodeObject *self,
5530 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005531 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005532{
5533 PyObject *list;
5534
5535 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005536 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005537
5538 list = PyList_New(0);
5539 if (!list)
5540 return NULL;
5541
5542 if (substring == NULL)
5543 return rsplit_whitespace(self,list,maxcount);
5544
5545 else if (substring->length == 1)
5546 return rsplit_char(self,list,substring->str[0],maxcount);
5547
5548 else if (substring->length == 0) {
5549 Py_DECREF(list);
5550 PyErr_SetString(PyExc_ValueError, "empty separator");
5551 return NULL;
5552 }
5553 else
5554 return rsplit_substring(self,list,substring,maxcount);
5555}
5556
5557static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558PyObject *replace(PyUnicodeObject *self,
5559 PyUnicodeObject *str1,
5560 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005561 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562{
5563 PyUnicodeObject *u;
5564
5565 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005566 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567
Thomas Wouters477c8d52006-05-27 19:21:47 +00005568 if (str1->length == str2->length) {
5569 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005570 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005571 if (str1->length == 1) {
5572 /* replace characters */
5573 Py_UNICODE u1, u2;
5574 if (!findchar(self->str, self->length, str1->str[0]))
5575 goto nothing;
5576 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5577 if (!u)
5578 return NULL;
5579 Py_UNICODE_COPY(u->str, self->str, self->length);
5580 u1 = str1->str[0];
5581 u2 = str2->str[0];
5582 for (i = 0; i < u->length; i++)
5583 if (u->str[i] == u1) {
5584 if (--maxcount < 0)
5585 break;
5586 u->str[i] = u2;
5587 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005589 i = fastsearch(
5590 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005591 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005592 if (i < 0)
5593 goto nothing;
5594 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5595 if (!u)
5596 return NULL;
5597 Py_UNICODE_COPY(u->str, self->str, self->length);
5598 while (i <= self->length - str1->length)
5599 if (Py_UNICODE_MATCH(self, i, str1)) {
5600 if (--maxcount < 0)
5601 break;
5602 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5603 i += str1->length;
5604 } else
5605 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005608
5609 Py_ssize_t n, i, j, e;
5610 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 Py_UNICODE *p;
5612
5613 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005614 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005615 if (n > maxcount)
5616 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005617 if (n == 0)
5618 goto nothing;
5619 /* new_size = self->length + n * (str2->length - str1->length)); */
5620 delta = (str2->length - str1->length);
5621 if (delta == 0) {
5622 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005624 product = n * (str2->length - str1->length);
5625 if ((product / (str2->length - str1->length)) != n) {
5626 PyErr_SetString(PyExc_OverflowError,
5627 "replace string is too long");
5628 return NULL;
5629 }
5630 new_size = self->length + product;
5631 if (new_size < 0) {
5632 PyErr_SetString(PyExc_OverflowError,
5633 "replace string is too long");
5634 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635 }
5636 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005637 u = _PyUnicode_New(new_size);
5638 if (!u)
5639 return NULL;
5640 i = 0;
5641 p = u->str;
5642 e = self->length - str1->length;
5643 if (str1->length > 0) {
5644 while (n-- > 0) {
5645 /* look for next match */
5646 j = i;
5647 while (j <= e) {
5648 if (Py_UNICODE_MATCH(self, j, str1))
5649 break;
5650 j++;
5651 }
5652 if (j > i) {
5653 if (j > e)
5654 break;
5655 /* copy unchanged part [i:j] */
5656 Py_UNICODE_COPY(p, self->str+i, j-i);
5657 p += j - i;
5658 }
5659 /* copy substitution string */
5660 if (str2->length > 0) {
5661 Py_UNICODE_COPY(p, str2->str, str2->length);
5662 p += str2->length;
5663 }
5664 i = j + str1->length;
5665 }
5666 if (i < self->length)
5667 /* copy tail [i:] */
5668 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5669 } else {
5670 /* interleave */
5671 while (n > 0) {
5672 Py_UNICODE_COPY(p, str2->str, str2->length);
5673 p += str2->length;
5674 if (--n <= 0)
5675 break;
5676 *p++ = self->str[i++];
5677 }
5678 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5679 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005682
5683nothing:
5684 /* nothing to replace; return original string (when possible) */
5685 if (PyUnicode_CheckExact(self)) {
5686 Py_INCREF(self);
5687 return (PyObject *) self;
5688 }
5689 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690}
5691
5692/* --- Unicode Object Methods --------------------------------------------- */
5693
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005694PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695"S.title() -> unicode\n\
5696\n\
5697Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005698characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699
5700static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005701unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 return fixup(self, fixtitle);
5704}
5705
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005706PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707"S.capitalize() -> unicode\n\
5708\n\
5709Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005710have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711
5712static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005713unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715 return fixup(self, fixcapitalize);
5716}
5717
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out) implementation of S.capwords(): split on
   whitespace, capitalize every word in place in the list, then join the
   words again.  Returns NULL if any step fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL) {
        return NULL;
    }

    /* Capitalize each word, swapping the new object into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capped = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                                 fixcapitalize);
        if (capped == NULL) {
            goto done;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
5755
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005756/* Argument converter. Coerces to a single unicode character */
5757
5758static int
5759convert_uc(PyObject *obj, void *addr)
5760{
5761 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5762 PyObject *uniobj;
5763 Py_UNICODE *unistr;
5764
5765 uniobj = PyUnicode_FromObject(obj);
5766 if (uniobj == NULL) {
5767 PyErr_SetString(PyExc_TypeError,
5768 "The fill character cannot be converted to Unicode");
5769 return 0;
5770 }
5771 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5772 PyErr_SetString(PyExc_TypeError,
5773 "The fill character must be exactly one character long");
5774 Py_DECREF(uniobj);
5775 return 0;
5776 }
5777 unistr = PyUnicode_AS_UNICODE(uniobj);
5778 *fillcharloc = unistr[0];
5779 Py_DECREF(uniobj);
5780 return 1;
5781}
5782
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005783PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005784"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005786Return S centered in a Unicode string of length width. Padding is\n\
5787done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788
5789static PyObject *
5790unicode_center(PyUnicodeObject *self, PyObject *args)
5791{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005792 Py_ssize_t marg, left;
5793 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005794 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795
Thomas Woutersde017742006-02-16 19:34:37 +00005796 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797 return NULL;
5798
Tim Peters7a29bd52001-09-12 03:03:31 +00005799 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800 Py_INCREF(self);
5801 return (PyObject*) self;
5802 }
5803
5804 marg = width - self->length;
5805 left = marg / 2 + (marg & width & 1);
5806
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005807 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005808}
5809
Marc-André Lemburge5034372000-08-08 08:04:29 +00005810#if 0
5811
5812/* This code should go into some future Unicode collation support
5813 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005814 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005815
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005816/* speedy UTF-16 code point order comparison */
5817/* gleaned from: */
5818/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5819
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005820static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005821{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005822 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005823 0, 0, 0, 0, 0, 0, 0, 0,
5824 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005825 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005826};
5827
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828static int
5829unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5830{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005831 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005832
Guido van Rossumd57fd912000-03-10 22:53:23 +00005833 Py_UNICODE *s1 = str1->str;
5834 Py_UNICODE *s2 = str2->str;
5835
5836 len1 = str1->length;
5837 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005838
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005840 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005841
5842 c1 = *s1++;
5843 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005844
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005845 if (c1 > (1<<11) * 26)
5846 c1 += utf16Fixup[c1>>11];
5847 if (c2 > (1<<11) * 26)
5848 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005849 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005850
5851 if (c1 != c2)
5852 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005853
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005854 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855 }
5856
5857 return (len1 < len2) ? -1 : (len1 != len2);
5858}
5859
Marc-André Lemburge5034372000-08-08 08:04:29 +00005860#else
5861
5862static int
5863unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5864{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005865 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005866
5867 Py_UNICODE *s1 = str1->str;
5868 Py_UNICODE *s2 = str2->str;
5869
5870 len1 = str1->length;
5871 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005872
Marc-André Lemburge5034372000-08-08 08:04:29 +00005873 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005874 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005875
Fredrik Lundh45714e92001-06-26 16:39:36 +00005876 c1 = *s1++;
5877 c2 = *s2++;
5878
5879 if (c1 != c2)
5880 return (c1 < c2) ? -1 : 1;
5881
Marc-André Lemburge5034372000-08-08 08:04:29 +00005882 len1--; len2--;
5883 }
5884
5885 return (len1 < len2) ? -1 : (len1 != len2);
5886}
5887
5888#endif
5889
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890int PyUnicode_Compare(PyObject *left,
5891 PyObject *right)
5892{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005893 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5894 return unicode_compare((PyUnicodeObject *)left,
5895 (PyUnicodeObject *)right);
5896 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5897 (PyUnicode_Check(left) && PyString_Check(right))) {
5898 if (PyUnicode_Check(left))
5899 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5900 if (PyUnicode_Check(right))
5901 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5902 assert(PyString_Check(left));
5903 assert(PyString_Check(right));
5904 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005906 PyErr_Format(PyExc_TypeError,
5907 "Can't compare %.100s and %.100s",
5908 left->ob_type->tp_name,
5909 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910 return -1;
5911}
5912
Martin v. Löwis5b222132007-06-10 09:51:05 +00005913int
5914PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
5915{
5916 int i;
5917 Py_UNICODE *id;
5918 assert(PyUnicode_Check(uni));
5919 id = PyUnicode_AS_UNICODE(uni);
5920 /* Compare Unicode string and source character set string */
5921 for (i = 0; id[i] && str[i]; i++)
5922 if (id[i] != str[i])
5923 return ((int)id[i] < (int)str[i]) ? -1 : 1;
5924 if (id[i])
5925 return 1; /* uni is longer */
5926 if (str[i])
5927 return -1; /* str is longer */
5928 return 0;
5929}
5930
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005931PyObject *PyUnicode_RichCompare(PyObject *left,
5932 PyObject *right,
5933 int op)
5934{
5935 int result;
5936
5937 result = PyUnicode_Compare(left, right);
5938 if (result == -1 && PyErr_Occurred())
5939 goto onError;
5940
5941 /* Convert the return value to a Boolean */
5942 switch (op) {
5943 case Py_EQ:
5944 result = (result == 0);
5945 break;
5946 case Py_NE:
5947 result = (result != 0);
5948 break;
5949 case Py_LE:
5950 result = (result <= 0);
5951 break;
5952 case Py_GE:
5953 result = (result >= 0);
5954 break;
5955 case Py_LT:
5956 result = (result == -1);
5957 break;
5958 case Py_GT:
5959 result = (result == 1);
5960 break;
5961 }
5962 return PyBool_FromLong(result);
5963
5964 onError:
5965
5966 /* Standard case
5967
5968 Type errors mean that PyUnicode_FromObject() could not convert
5969 one of the arguments (usually the right hand side) to Unicode,
5970 ie. we can't handle the comparison request. However, it is
5971 possible that the other object knows a comparison method, which
5972 is why we return Py_NotImplemented to give the other object a
5973 chance.
5974
5975 */
5976 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5977 PyErr_Clear();
5978 Py_INCREF(Py_NotImplemented);
5979 return Py_NotImplemented;
5980 }
5981 if (op != Py_EQ && op != Py_NE)
5982 return NULL;
5983
5984 /* Equality comparison.
5985
5986 This is a special case: we silence any PyExc_UnicodeDecodeError
5987 and instead turn it into a PyErr_UnicodeWarning.
5988
5989 */
5990 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5991 return NULL;
5992 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00005993 if (PyErr_WarnEx(PyExc_UnicodeWarning,
5994 (op == Py_EQ) ?
5995 "Unicode equal comparison "
5996 "failed to convert both arguments to Unicode - "
5997 "interpreting them as being unequal"
5998 :
5999 "Unicode unequal comparison "
6000 "failed to convert both arguments to Unicode - "
6001 "interpreting them as being unequal",
6002 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006003 return NULL;
6004 result = (op == Py_NE);
6005 return PyBool_FromLong(result);
6006}
6007
Guido van Rossum403d68b2000-03-13 15:55:09 +00006008int PyUnicode_Contains(PyObject *container,
6009 PyObject *element)
6010{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006011 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006012 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006013
6014 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006015 sub = PyUnicode_FromObject(element);
6016 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006017 PyErr_Format(PyExc_TypeError,
6018 "'in <string>' requires string as left operand, not %s",
6019 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006020 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006021 }
6022
Thomas Wouters477c8d52006-05-27 19:21:47 +00006023 str = PyUnicode_FromObject(container);
6024 if (!str) {
6025 Py_DECREF(sub);
6026 return -1;
6027 }
6028
6029 result = stringlib_contains_obj(str, sub);
6030
6031 Py_DECREF(str);
6032 Py_DECREF(sub);
6033
Guido van Rossum403d68b2000-03-13 15:55:09 +00006034 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006035}
6036
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037/* Concat to string or Unicode object giving a new Unicode object. */
6038
6039PyObject *PyUnicode_Concat(PyObject *left,
6040 PyObject *right)
6041{
6042 PyUnicodeObject *u = NULL, *v = NULL, *w;
6043
Guido van Rossum84d79dd2007-04-13 02:23:57 +00006044 if (PyBytes_Check(left) || PyBytes_Check(right))
6045 return PyBytes_Concat(left, right);
6046
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047 /* Coerce the two arguments */
6048 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6049 if (u == NULL)
6050 goto onError;
6051 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6052 if (v == NULL)
6053 goto onError;
6054
6055 /* Shortcuts */
6056 if (v == unicode_empty) {
6057 Py_DECREF(v);
6058 return (PyObject *)u;
6059 }
6060 if (u == unicode_empty) {
6061 Py_DECREF(u);
6062 return (PyObject *)v;
6063 }
6064
6065 /* Concat the two Unicode strings */
6066 w = _PyUnicode_New(u->length + v->length);
6067 if (w == NULL)
6068 goto onError;
6069 Py_UNICODE_COPY(w->str, u->str, u->length);
6070 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6071
6072 Py_DECREF(u);
6073 Py_DECREF(v);
6074 return (PyObject *)w;
6075
6076onError:
6077 Py_XDECREF(u);
6078 Py_XDECREF(v);
6079 return NULL;
6080}
6081
Walter Dörwald1ab83302007-05-18 17:15:44 +00006082void
6083PyUnicode_Append(PyObject **pleft, PyObject *right)
6084{
6085 PyObject *new;
6086 if (*pleft == NULL)
6087 return;
6088 if (right == NULL || !PyUnicode_Check(*pleft)) {
6089 Py_DECREF(*pleft);
6090 *pleft = NULL;
6091 return;
6092 }
6093 new = PyUnicode_Concat(*pleft, right);
6094 Py_DECREF(*pleft);
6095 *pleft = new;
6096}
6097
6098void
6099PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6100{
6101 PyUnicode_Append(pleft, right);
6102 Py_XDECREF(right);
6103}
6104
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006105PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106"S.count(sub[, start[, end]]) -> int\n\
6107\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006108Return the number of non-overlapping occurrences of substring sub in\n\
6109Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006110interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111
6112static PyObject *
6113unicode_count(PyUnicodeObject *self, PyObject *args)
6114{
6115 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006116 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006117 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118 PyObject *result;
6119
Guido van Rossumb8872e62000-05-09 14:14:27 +00006120 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6121 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122 return NULL;
6123
6124 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006125 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126 if (substring == NULL)
6127 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006128
Thomas Wouters477c8d52006-05-27 19:21:47 +00006129 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130
Thomas Wouters477c8d52006-05-27 19:21:47 +00006131 result = PyInt_FromSsize_t(
6132 stringlib_count(self->str + start, end - start,
6133 substring->str, substring->length)
6134 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135
6136 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006137
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138 return result;
6139}
6140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006141PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006142"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006144Encodes S using the codec registered for encoding. encoding defaults\n\
6145to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006146handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006147a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6148'xmlcharrefreplace' as well as any other name registered with\n\
6149codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150
6151static PyObject *
6152unicode_encode(PyUnicodeObject *self, PyObject *args)
6153{
6154 char *encoding = NULL;
6155 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006156 PyObject *v;
6157
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6159 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006160 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006161 if (v == NULL)
6162 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006163 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00006164 if (PyString_Check(v)) {
6165 /* Old codec, turn it into bytes */
6166 PyObject *b = PyBytes_FromObject(v);
6167 Py_DECREF(v);
6168 return b;
6169 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006170 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006171 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006172 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006173 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006174 Py_DECREF(v);
6175 return NULL;
6176 }
6177 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006178
6179 onError:
6180 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006181}
6182
6183PyDoc_STRVAR(decode__doc__,
6184"S.decode([encoding[,errors]]) -> string or unicode\n\
6185\n\
6186Decodes S using the codec registered for encoding. encoding defaults\n\
6187to the default encoding. errors may be given to set a different error\n\
6188handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6189a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6190as well as any other name registerd with codecs.register_error that is\n\
6191able to handle UnicodeDecodeErrors.");
6192
6193static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006194unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006195{
6196 char *encoding = NULL;
6197 char *errors = NULL;
6198 PyObject *v;
6199
6200 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6201 return NULL;
6202 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006203 if (v == NULL)
6204 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006205 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6206 PyErr_Format(PyExc_TypeError,
6207 "decoder did not return a string/unicode object "
6208 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006209 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006210 Py_DECREF(v);
6211 return NULL;
6212 }
6213 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006214
6215 onError:
6216 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217}
6218
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006219PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220"S.expandtabs([tabsize]) -> unicode\n\
6221\n\
6222Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006223If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224
6225static PyObject*
6226unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6227{
6228 Py_UNICODE *e;
6229 Py_UNICODE *p;
6230 Py_UNICODE *q;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006231 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 PyUnicodeObject *u;
6233 int tabsize = 8;
6234
6235 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6236 return NULL;
6237
Thomas Wouters7e474022000-07-16 12:04:32 +00006238 /* First pass: determine size of output string */
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006239 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240 e = self->str + self->length;
6241 for (p = self->str; p < e; p++)
6242 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006243 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244 j += tabsize - (j % tabsize);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006245 if (old_j > j) {
6246 PyErr_SetString(PyExc_OverflowError,
6247 "new string is too long");
6248 return NULL;
6249 }
6250 old_j = j;
6251 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006252 }
6253 else {
6254 j++;
6255 if (*p == '\n' || *p == '\r') {
6256 i += j;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006257 old_j = j = 0;
6258 if (i < 0) {
6259 PyErr_SetString(PyExc_OverflowError,
6260 "new string is too long");
6261 return NULL;
6262 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263 }
6264 }
6265
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006266 if ((i + j) < 0) {
6267 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6268 return NULL;
6269 }
6270
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271 /* Second pass: create output string and fill it */
6272 u = _PyUnicode_New(i + j);
6273 if (!u)
6274 return NULL;
6275
6276 j = 0;
6277 q = u->str;
6278
6279 for (p = self->str; p < e; p++)
6280 if (*p == '\t') {
6281 if (tabsize > 0) {
6282 i = tabsize - (j % tabsize);
6283 j += i;
6284 while (i--)
6285 *q++ = ' ';
6286 }
6287 }
6288 else {
6289 j++;
6290 *q++ = *p;
6291 if (*p == '\n' || *p == '\r')
6292 j = 0;
6293 }
6294
6295 return (PyObject*) u;
6296}
6297
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006298PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299"S.find(sub [,start [,end]]) -> int\n\
6300\n\
6301Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006302such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303arguments start and end are interpreted as in slice notation.\n\
6304\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006305Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306
6307static PyObject *
6308unicode_find(PyUnicodeObject *self, PyObject *args)
6309{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006310 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006311 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006312 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006313 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314
Guido van Rossumb8872e62000-05-09 14:14:27 +00006315 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6316 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006317 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006318 substring = PyUnicode_FromObject(substring);
6319 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320 return NULL;
6321
Thomas Wouters477c8d52006-05-27 19:21:47 +00006322 result = stringlib_find_slice(
6323 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6324 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6325 start, end
6326 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327
6328 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006329
6330 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331}
6332
6333static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006334unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335{
6336 if (index < 0 || index >= self->length) {
6337 PyErr_SetString(PyExc_IndexError, "string index out of range");
6338 return NULL;
6339 }
6340
6341 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6342}
6343
6344static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006345unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006346{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006347 /* Since Unicode objects compare equal to their UTF-8 string
6348 counterparts, we hash the UTF-8 string. */
6349 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
6350 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351}
6352
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006353PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354"S.index(sub [,start [,end]]) -> int\n\
6355\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006356Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357
6358static PyObject *
6359unicode_index(PyUnicodeObject *self, PyObject *args)
6360{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006361 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006362 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006363 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006364 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006365
Guido van Rossumb8872e62000-05-09 14:14:27 +00006366 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6367 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006369 substring = PyUnicode_FromObject(substring);
6370 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371 return NULL;
6372
Thomas Wouters477c8d52006-05-27 19:21:47 +00006373 result = stringlib_find_slice(
6374 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6375 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6376 start, end
6377 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378
6379 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006380
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381 if (result < 0) {
6382 PyErr_SetString(PyExc_ValueError, "substring not found");
6383 return NULL;
6384 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006385
Martin v. Löwis18e16552006-02-15 17:27:45 +00006386 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387}
6388
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006389PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006390"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006392Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006393at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394
6395static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006396unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397{
6398 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6399 register const Py_UNICODE *e;
6400 int cased;
6401
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402 /* Shortcut for single character strings */
6403 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006404 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006406 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006407 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006408 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006409
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410 e = p + PyUnicode_GET_SIZE(self);
6411 cased = 0;
6412 for (; p < e; p++) {
6413 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006414
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006416 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417 else if (!cased && Py_UNICODE_ISLOWER(ch))
6418 cased = 1;
6419 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006420 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421}
6422
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006423PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006424"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006426Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006427at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428
6429static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006430unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431{
6432 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6433 register const Py_UNICODE *e;
6434 int cased;
6435
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436 /* Shortcut for single character strings */
6437 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006438 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006440 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006441 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006442 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006443
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444 e = p + PyUnicode_GET_SIZE(self);
6445 cased = 0;
6446 for (; p < e; p++) {
6447 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006448
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006450 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451 else if (!cased && Py_UNICODE_ISUPPER(ch))
6452 cased = 1;
6453 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006454 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455}
6456
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006457PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006458"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006460Return True if S is a titlecased string and there is at least one\n\
6461character in S, i.e. upper- and titlecase characters may only\n\
6462follow uncased characters and lowercase characters only cased ones.\n\
6463Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464
6465static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006466unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467{
6468 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6469 register const Py_UNICODE *e;
6470 int cased, previous_is_cased;
6471
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472 /* Shortcut for single character strings */
6473 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006474 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6475 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006477 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006478 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006479 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006480
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481 e = p + PyUnicode_GET_SIZE(self);
6482 cased = 0;
6483 previous_is_cased = 0;
6484 for (; p < e; p++) {
6485 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006486
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6488 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006489 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490 previous_is_cased = 1;
6491 cased = 1;
6492 }
6493 else if (Py_UNICODE_ISLOWER(ch)) {
6494 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006495 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496 previous_is_cased = 1;
6497 cased = 1;
6498 }
6499 else
6500 previous_is_cased = 0;
6501 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006502 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503}
6504
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006505PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006506"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006508Return True if all characters in S are whitespace\n\
6509and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510
6511static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006512unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513{
6514 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6515 register const Py_UNICODE *e;
6516
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517 /* Shortcut for single character strings */
6518 if (PyUnicode_GET_SIZE(self) == 1 &&
6519 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006520 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006522 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006523 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006524 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006525
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526 e = p + PyUnicode_GET_SIZE(self);
6527 for (; p < e; p++) {
6528 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006529 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006531 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532}
6533
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006534PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006535"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006536\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006537Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006538and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006539
6540static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006541unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006542{
6543 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6544 register const Py_UNICODE *e;
6545
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006546 /* Shortcut for single character strings */
6547 if (PyUnicode_GET_SIZE(self) == 1 &&
6548 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006549 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006550
6551 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006552 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006553 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006554
6555 e = p + PyUnicode_GET_SIZE(self);
6556 for (; p < e; p++) {
6557 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006558 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006559 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006560 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006561}
6562
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006563PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006564"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006565\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006566Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006567and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006568
6569static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006570unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006571{
6572 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6573 register const Py_UNICODE *e;
6574
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006575 /* Shortcut for single character strings */
6576 if (PyUnicode_GET_SIZE(self) == 1 &&
6577 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006578 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006579
6580 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006581 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006582 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006583
6584 e = p + PyUnicode_GET_SIZE(self);
6585 for (; p < e; p++) {
6586 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006587 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006588 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006589 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006590}
6591
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006592PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006593"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006595Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006596False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597
6598static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006599unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600{
6601 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6602 register const Py_UNICODE *e;
6603
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604 /* Shortcut for single character strings */
6605 if (PyUnicode_GET_SIZE(self) == 1 &&
6606 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006607 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006609 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006610 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006611 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006612
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613 e = p + PyUnicode_GET_SIZE(self);
6614 for (; p < e; p++) {
6615 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006616 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006618 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619}
6620
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006621PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006622"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006624Return True if all characters in S are digits\n\
6625and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626
6627static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006628unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629{
6630 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6631 register const Py_UNICODE *e;
6632
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 /* Shortcut for single character strings */
6634 if (PyUnicode_GET_SIZE(self) == 1 &&
6635 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006636 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006638 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006639 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006640 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006641
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642 e = p + PyUnicode_GET_SIZE(self);
6643 for (; p < e; p++) {
6644 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006645 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006647 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648}
6649
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006650PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006651"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006653Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006654False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655
6656static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006657unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658{
6659 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6660 register const Py_UNICODE *e;
6661
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662 /* Shortcut for single character strings */
6663 if (PyUnicode_GET_SIZE(self) == 1 &&
6664 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006665 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006667 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006668 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006669 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006670
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671 e = p + PyUnicode_GET_SIZE(self);
6672 for (; p < e; p++) {
6673 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006674 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006676 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677}
6678
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006679PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680"S.join(sequence) -> unicode\n\
6681\n\
6682Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006683sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684
6685static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006686unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006688 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689}
6690
Martin v. Löwis18e16552006-02-15 17:27:45 +00006691static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692unicode_length(PyUnicodeObject *self)
6693{
6694 return self->length;
6695}
6696
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006697PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006698"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699\n\
6700Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006701done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702
6703static PyObject *
6704unicode_ljust(PyUnicodeObject *self, PyObject *args)
6705{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006706 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006707 Py_UNICODE fillchar = ' ';
6708
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006709 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710 return NULL;
6711
Tim Peters7a29bd52001-09-12 03:03:31 +00006712 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713 Py_INCREF(self);
6714 return (PyObject*) self;
6715 }
6716
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006717 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718}
6719
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006720PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721"S.lower() -> unicode\n\
6722\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006723Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724
6725static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006726unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728 return fixup(self, fixlower);
6729}
6730
/* Strip modes; they double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string without its leading
   three characters ("|O:"). */
#define STRIPNAME(i) (&stripformat[i][3])
6739
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006740/* externally visible for str.strip(unicode) */
6741PyObject *
6742_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6743{
6744 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006745 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006746 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006747 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6748 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006749
Thomas Wouters477c8d52006-05-27 19:21:47 +00006750 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6751
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006752 i = 0;
6753 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006754 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6755 i++;
6756 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006757 }
6758
6759 j = len;
6760 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006761 do {
6762 j--;
6763 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6764 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006765 }
6766
6767 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006768 Py_INCREF(self);
6769 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006770 }
6771 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006772 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006773}
6774
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775
6776static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006777do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006779 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006780 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006781
6782 i = 0;
6783 if (striptype != RIGHTSTRIP) {
6784 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6785 i++;
6786 }
6787 }
6788
6789 j = len;
6790 if (striptype != LEFTSTRIP) {
6791 do {
6792 j--;
6793 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6794 j++;
6795 }
6796
6797 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6798 Py_INCREF(self);
6799 return (PyObject*)self;
6800 }
6801 else
6802 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803}
6804
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006805
6806static PyObject *
6807do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6808{
6809 PyObject *sep = NULL;
6810
6811 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6812 return NULL;
6813
6814 if (sep != NULL && sep != Py_None) {
6815 if (PyUnicode_Check(sep))
6816 return _PyUnicode_XStrip(self, striptype, sep);
6817 else if (PyString_Check(sep)) {
6818 PyObject *res;
6819 sep = PyUnicode_FromObject(sep);
6820 if (sep==NULL)
6821 return NULL;
6822 res = _PyUnicode_XStrip(self, striptype, sep);
6823 Py_DECREF(sep);
6824 return res;
6825 }
6826 else {
6827 PyErr_Format(PyExc_TypeError,
6828 "%s arg must be None, unicode or str",
6829 STRIPNAME(striptype));
6830 return NULL;
6831 }
6832 }
6833
6834 return do_strip(self, striptype);
6835}
6836
6837
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006838PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006839"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006840\n\
6841Return a copy of the string S with leading and trailing\n\
6842whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006843If chars is given and not None, remove characters in chars instead.\n\
6844If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006845
6846static PyObject *
6847unicode_strip(PyUnicodeObject *self, PyObject *args)
6848{
6849 if (PyTuple_GET_SIZE(args) == 0)
6850 return do_strip(self, BOTHSTRIP); /* Common case */
6851 else
6852 return do_argstrip(self, BOTHSTRIP, args);
6853}
6854
6855
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006856PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006857"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006858\n\
6859Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006860If chars is given and not None, remove characters in chars instead.\n\
6861If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006862
6863static PyObject *
6864unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6865{
6866 if (PyTuple_GET_SIZE(args) == 0)
6867 return do_strip(self, LEFTSTRIP); /* Common case */
6868 else
6869 return do_argstrip(self, LEFTSTRIP, args);
6870}
6871
6872
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006873PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006874"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006875\n\
6876Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006877If chars is given and not None, remove characters in chars instead.\n\
6878If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006879
6880static PyObject *
6881unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6882{
6883 if (PyTuple_GET_SIZE(args) == 0)
6884 return do_strip(self, RIGHTSTRIP); /* Common case */
6885 else
6886 return do_argstrip(self, RIGHTSTRIP, args);
6887}
6888
6889
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006891unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892{
6893 PyUnicodeObject *u;
6894 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006895 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006896 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897
6898 if (len < 0)
6899 len = 0;
6900
Tim Peters7a29bd52001-09-12 03:03:31 +00006901 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902 /* no repeat, return original string */
6903 Py_INCREF(str);
6904 return (PyObject*) str;
6905 }
Tim Peters8f422462000-09-09 06:13:41 +00006906
6907 /* ensure # of chars needed doesn't overflow int and # of bytes
6908 * needed doesn't overflow size_t
6909 */
6910 nchars = len * str->length;
6911 if (len && nchars / len != str->length) {
6912 PyErr_SetString(PyExc_OverflowError,
6913 "repeated string is too long");
6914 return NULL;
6915 }
6916 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6917 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6918 PyErr_SetString(PyExc_OverflowError,
6919 "repeated string is too long");
6920 return NULL;
6921 }
6922 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923 if (!u)
6924 return NULL;
6925
6926 p = u->str;
6927
Thomas Wouters477c8d52006-05-27 19:21:47 +00006928 if (str->length == 1 && len > 0) {
6929 Py_UNICODE_FILL(p, str->str[0], len);
6930 } else {
6931 Py_ssize_t done = 0; /* number of characters copied this far */
6932 if (done < nchars) {
6933 Py_UNICODE_COPY(p, str->str, str->length);
6934 done = str->length;
6935 }
6936 while (done < nchars) {
6937 int n = (done <= nchars-done) ? done : nchars-done;
6938 Py_UNICODE_COPY(p+done, p, n);
6939 done += n;
6940 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941 }
6942
6943 return (PyObject*) u;
6944}
6945
6946PyObject *PyUnicode_Replace(PyObject *obj,
6947 PyObject *subobj,
6948 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006949 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950{
6951 PyObject *self;
6952 PyObject *str1;
6953 PyObject *str2;
6954 PyObject *result;
6955
6956 self = PyUnicode_FromObject(obj);
6957 if (self == NULL)
6958 return NULL;
6959 str1 = PyUnicode_FromObject(subobj);
6960 if (str1 == NULL) {
6961 Py_DECREF(self);
6962 return NULL;
6963 }
6964 str2 = PyUnicode_FromObject(replobj);
6965 if (str2 == NULL) {
6966 Py_DECREF(self);
6967 Py_DECREF(str1);
6968 return NULL;
6969 }
Tim Petersced69f82003-09-16 20:30:58 +00006970 result = replace((PyUnicodeObject *)self,
6971 (PyUnicodeObject *)str1,
6972 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973 maxcount);
6974 Py_DECREF(self);
6975 Py_DECREF(str1);
6976 Py_DECREF(str2);
6977 return result;
6978}
6979
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006980PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981"S.replace (old, new[, maxsplit]) -> unicode\n\
6982\n\
6983Return a copy of S with all occurrences of substring\n\
6984old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006985given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986
6987static PyObject*
6988unicode_replace(PyUnicodeObject *self, PyObject *args)
6989{
6990 PyUnicodeObject *str1;
6991 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006992 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993 PyObject *result;
6994
Martin v. Löwis18e16552006-02-15 17:27:45 +00006995 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996 return NULL;
6997 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6998 if (str1 == NULL)
6999 return NULL;
7000 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007001 if (str2 == NULL) {
7002 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007004 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005
7006 result = replace(self, str1, str2, maxcount);
7007
7008 Py_DECREF(str1);
7009 Py_DECREF(str2);
7010 return result;
7011}
7012
7013static
7014PyObject *unicode_repr(PyObject *unicode)
7015{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007016 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007017 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007018 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7019 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7020
7021 /* XXX(nnorwitz): rather than over-allocating, it would be
7022 better to choose a different scheme. Perhaps scan the
7023 first N-chars of the string and allocate based on that size.
7024 */
7025 /* Initial allocation is based on the longest-possible unichr
7026 escape.
7027
7028 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7029 unichr, so in this case it's the longest unichr escape. In
7030 narrow (UTF-16) builds this is five chars per source unichr
7031 since there are two unichrs in the surrogate pair, so in narrow
7032 (UTF-16) builds it's not the longest unichr escape.
7033
7034 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7035 so in the narrow (UTF-16) build case it's the longest unichr
7036 escape.
7037 */
7038
Walter Dörwald1ab83302007-05-18 17:15:44 +00007039 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007040 2 /* quotes */
7041#ifdef Py_UNICODE_WIDE
7042 + 10*size
7043#else
7044 + 6*size
7045#endif
7046 + 1);
7047 if (repr == NULL)
7048 return NULL;
7049
Walter Dörwald1ab83302007-05-18 17:15:44 +00007050 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007051
7052 /* Add quote */
7053 *p++ = (findchar(s, size, '\'') &&
7054 !findchar(s, size, '"')) ? '"' : '\'';
7055 while (size-- > 0) {
7056 Py_UNICODE ch = *s++;
7057
7058 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007059 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007060 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007061 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007062 continue;
7063 }
7064
7065#ifdef Py_UNICODE_WIDE
7066 /* Map 21-bit characters to '\U00xxxxxx' */
7067 else if (ch >= 0x10000) {
7068 *p++ = '\\';
7069 *p++ = 'U';
7070 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7071 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7072 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7073 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7074 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7075 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7076 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7077 *p++ = hexdigits[ch & 0x0000000F];
7078 continue;
7079 }
7080#else
7081 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7082 else if (ch >= 0xD800 && ch < 0xDC00) {
7083 Py_UNICODE ch2;
7084 Py_UCS4 ucs;
7085
7086 ch2 = *s++;
7087 size--;
7088 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7089 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7090 *p++ = '\\';
7091 *p++ = 'U';
7092 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7093 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7094 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7095 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7096 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7097 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7098 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7099 *p++ = hexdigits[ucs & 0x0000000F];
7100 continue;
7101 }
7102 /* Fall through: isolated surrogates are copied as-is */
7103 s--;
7104 size++;
7105 }
7106#endif
7107
7108 /* Map 16-bit characters to '\uxxxx' */
7109 if (ch >= 256) {
7110 *p++ = '\\';
7111 *p++ = 'u';
7112 *p++ = hexdigits[(ch >> 12) & 0x000F];
7113 *p++ = hexdigits[(ch >> 8) & 0x000F];
7114 *p++ = hexdigits[(ch >> 4) & 0x000F];
7115 *p++ = hexdigits[ch & 0x000F];
7116 }
7117
7118 /* Map special whitespace to '\t', \n', '\r' */
7119 else if (ch == '\t') {
7120 *p++ = '\\';
7121 *p++ = 't';
7122 }
7123 else if (ch == '\n') {
7124 *p++ = '\\';
7125 *p++ = 'n';
7126 }
7127 else if (ch == '\r') {
7128 *p++ = '\\';
7129 *p++ = 'r';
7130 }
7131
7132 /* Map non-printable US ASCII to '\xhh' */
7133 else if (ch < ' ' || ch >= 0x7F) {
7134 *p++ = '\\';
7135 *p++ = 'x';
7136 *p++ = hexdigits[(ch >> 4) & 0x000F];
7137 *p++ = hexdigits[ch & 0x000F];
7138 }
7139
7140 /* Copy everything else as-is */
7141 else
7142 *p++ = (char) ch;
7143 }
7144 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007145 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007146
7147 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007148 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007149 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150}
7151
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007152PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007153"S.rfind(sub [,start [,end]]) -> int\n\
7154\n\
7155Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007156such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157arguments start and end are interpreted as in slice notation.\n\
7158\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007159Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160
7161static PyObject *
7162unicode_rfind(PyUnicodeObject *self, PyObject *args)
7163{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007164 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007165 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007166 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007167 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007168
Guido van Rossumb8872e62000-05-09 14:14:27 +00007169 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7170 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007172 substring = PyUnicode_FromObject(substring);
7173 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007174 return NULL;
7175
Thomas Wouters477c8d52006-05-27 19:21:47 +00007176 result = stringlib_rfind_slice(
7177 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7178 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7179 start, end
7180 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181
7182 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007183
7184 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185}
7186
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007187PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007188"S.rindex(sub [,start [,end]]) -> int\n\
7189\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007190Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191
7192static PyObject *
7193unicode_rindex(PyUnicodeObject *self, PyObject *args)
7194{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007195 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007196 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007197 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007198 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199
Guido van Rossumb8872e62000-05-09 14:14:27 +00007200 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7201 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007202 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007203 substring = PyUnicode_FromObject(substring);
7204 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205 return NULL;
7206
Thomas Wouters477c8d52006-05-27 19:21:47 +00007207 result = stringlib_rfind_slice(
7208 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7209 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7210 start, end
7211 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212
7213 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007214
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215 if (result < 0) {
7216 PyErr_SetString(PyExc_ValueError, "substring not found");
7217 return NULL;
7218 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007219 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007220}
7221
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007222PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007223"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224\n\
7225Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007226done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227
7228static PyObject *
7229unicode_rjust(PyUnicodeObject *self, PyObject *args)
7230{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007231 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007232 Py_UNICODE fillchar = ' ';
7233
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007234 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007235 return NULL;
7236
Tim Peters7a29bd52001-09-12 03:03:31 +00007237 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007238 Py_INCREF(self);
7239 return (PyObject*) self;
7240 }
7241
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007242 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243}
7244
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007246unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007247{
7248 /* standard clamping */
7249 if (start < 0)
7250 start = 0;
7251 if (end < 0)
7252 end = 0;
7253 if (end > self->length)
7254 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007255 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256 /* full slice, return original string */
7257 Py_INCREF(self);
7258 return (PyObject*) self;
7259 }
7260 if (start > end)
7261 start = end;
7262 /* copy slice */
7263 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7264 end - start);
7265}
7266
7267PyObject *PyUnicode_Split(PyObject *s,
7268 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007269 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270{
7271 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007272
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273 s = PyUnicode_FromObject(s);
7274 if (s == NULL)
7275 return NULL;
7276 if (sep != NULL) {
7277 sep = PyUnicode_FromObject(sep);
7278 if (sep == NULL) {
7279 Py_DECREF(s);
7280 return NULL;
7281 }
7282 }
7283
7284 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7285
7286 Py_DECREF(s);
7287 Py_XDECREF(sep);
7288 return result;
7289}
7290
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007291PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292"S.split([sep [,maxsplit]]) -> list of strings\n\
7293\n\
7294Return a list of the words in S, using sep as the\n\
7295delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007296splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007297any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007298
7299static PyObject*
7300unicode_split(PyUnicodeObject *self, PyObject *args)
7301{
7302 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007303 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007304
Martin v. Löwis18e16552006-02-15 17:27:45 +00007305 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306 return NULL;
7307
7308 if (substring == Py_None)
7309 return split(self, NULL, maxcount);
7310 else if (PyUnicode_Check(substring))
7311 return split(self, (PyUnicodeObject *)substring, maxcount);
7312 else
7313 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7314}
7315
Thomas Wouters477c8d52006-05-27 19:21:47 +00007316PyObject *
7317PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7318{
7319 PyObject* str_obj;
7320 PyObject* sep_obj;
7321 PyObject* out;
7322
7323 str_obj = PyUnicode_FromObject(str_in);
7324 if (!str_obj)
7325 return NULL;
7326 sep_obj = PyUnicode_FromObject(sep_in);
7327 if (!sep_obj) {
7328 Py_DECREF(str_obj);
7329 return NULL;
7330 }
7331
7332 out = stringlib_partition(
7333 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7334 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7335 );
7336
7337 Py_DECREF(sep_obj);
7338 Py_DECREF(str_obj);
7339
7340 return out;
7341}
7342
7343
7344PyObject *
7345PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7346{
7347 PyObject* str_obj;
7348 PyObject* sep_obj;
7349 PyObject* out;
7350
7351 str_obj = PyUnicode_FromObject(str_in);
7352 if (!str_obj)
7353 return NULL;
7354 sep_obj = PyUnicode_FromObject(sep_in);
7355 if (!sep_obj) {
7356 Py_DECREF(str_obj);
7357 return NULL;
7358 }
7359
7360 out = stringlib_rpartition(
7361 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7362 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7363 );
7364
7365 Py_DECREF(sep_obj);
7366 Py_DECREF(str_obj);
7367
7368 return out;
7369}
7370
7371PyDoc_STRVAR(partition__doc__,
7372"S.partition(sep) -> (head, sep, tail)\n\
7373\n\
7374Searches for the separator sep in S, and returns the part before it,\n\
7375the separator itself, and the part after it. If the separator is not\n\
7376found, returns S and two empty strings.");
7377
7378static PyObject*
7379unicode_partition(PyUnicodeObject *self, PyObject *separator)
7380{
7381 return PyUnicode_Partition((PyObject *)self, separator);
7382}
7383
7384PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007385"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007386\n\
7387Searches for the separator sep in S, starting at the end of S, and returns\n\
7388the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007389separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007390
7391static PyObject*
7392unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7393{
7394 return PyUnicode_RPartition((PyObject *)self, separator);
7395}
7396
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007397PyObject *PyUnicode_RSplit(PyObject *s,
7398 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007399 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007400{
7401 PyObject *result;
7402
7403 s = PyUnicode_FromObject(s);
7404 if (s == NULL)
7405 return NULL;
7406 if (sep != NULL) {
7407 sep = PyUnicode_FromObject(sep);
7408 if (sep == NULL) {
7409 Py_DECREF(s);
7410 return NULL;
7411 }
7412 }
7413
7414 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7415
7416 Py_DECREF(s);
7417 Py_XDECREF(sep);
7418 return result;
7419}
7420
7421PyDoc_STRVAR(rsplit__doc__,
7422"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7423\n\
7424Return a list of the words in S, using sep as the\n\
7425delimiter string, starting at the end of the string and\n\
7426working to the front. If maxsplit is given, at most maxsplit\n\
7427splits are done. If sep is not specified, any whitespace string\n\
7428is a separator.");
7429
7430static PyObject*
7431unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7432{
7433 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007434 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007435
Martin v. Löwis18e16552006-02-15 17:27:45 +00007436 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007437 return NULL;
7438
7439 if (substring == Py_None)
7440 return rsplit(self, NULL, maxcount);
7441 else if (PyUnicode_Check(substring))
7442 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7443 else
7444 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7445}
7446
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007447PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007448"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449\n\
7450Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007451Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007452is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453
7454static PyObject*
7455unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7456{
Guido van Rossum86662912000-04-11 15:38:46 +00007457 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007458
Guido van Rossum86662912000-04-11 15:38:46 +00007459 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460 return NULL;
7461
Guido van Rossum86662912000-04-11 15:38:46 +00007462 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463}
7464
7465static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007466PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467{
Walter Dörwald346737f2007-05-31 10:44:43 +00007468 if (PyUnicode_CheckExact(self)) {
7469 Py_INCREF(self);
7470 return self;
7471 } else
7472 /* Subtype -- return genuine unicode string with the same value. */
7473 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7474 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475}
7476
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007477PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007478"S.swapcase() -> unicode\n\
7479\n\
7480Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007481and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482
7483static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007484unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007486 return fixup(self, fixswapcase);
7487}
7488
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007489PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490"S.translate(table) -> unicode\n\
7491\n\
7492Return a copy of the string S, where all characters have been mapped\n\
7493through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007494Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7495Unmapped characters are left untouched. Characters mapped to None\n\
7496are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497
7498static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007499unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500{
Tim Petersced69f82003-09-16 20:30:58 +00007501 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007502 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007503 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504 "ignore");
7505}
7506
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007507PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508"S.upper() -> unicode\n\
7509\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007510Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007511
7512static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007513unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007514{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515 return fixup(self, fixupper);
7516}
7517
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007518PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007519"S.zfill(width) -> unicode\n\
7520\n\
7521Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007522of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007523
7524static PyObject *
7525unicode_zfill(PyUnicodeObject *self, PyObject *args)
7526{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007527 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528 PyUnicodeObject *u;
7529
Martin v. Löwis18e16552006-02-15 17:27:45 +00007530 Py_ssize_t width;
7531 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532 return NULL;
7533
7534 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007535 if (PyUnicode_CheckExact(self)) {
7536 Py_INCREF(self);
7537 return (PyObject*) self;
7538 }
7539 else
7540 return PyUnicode_FromUnicode(
7541 PyUnicode_AS_UNICODE(self),
7542 PyUnicode_GET_SIZE(self)
7543 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007544 }
7545
7546 fill = width - self->length;
7547
7548 u = pad(self, fill, 0, '0');
7549
Walter Dörwald068325e2002-04-15 13:36:47 +00007550 if (u == NULL)
7551 return NULL;
7552
Guido van Rossumd57fd912000-03-10 22:53:23 +00007553 if (u->str[fill] == '+' || u->str[fill] == '-') {
7554 /* move sign to beginning of string */
7555 u->str[0] = u->str[fill];
7556 u->str[fill] = '0';
7557 }
7558
7559 return (PyObject*) u;
7560}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561
#if 0
/* Compiled out: returns the value of unicode_freelist_size as an int
   object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = unicode_freelist_size;

    return PyInt_FromLong(count);
}
#endif
7569
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007570PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007571"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007573Return True if S starts with the specified prefix, False otherwise.\n\
7574With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007575With optional end, stop comparing S at that position.\n\
7576prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577
7578static PyObject *
7579unicode_startswith(PyUnicodeObject *self,
7580 PyObject *args)
7581{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007582 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007584 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007585 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007586 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007588 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007589 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007591 if (PyTuple_Check(subobj)) {
7592 Py_ssize_t i;
7593 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7594 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7595 PyTuple_GET_ITEM(subobj, i));
7596 if (substring == NULL)
7597 return NULL;
7598 result = tailmatch(self, substring, start, end, -1);
7599 Py_DECREF(substring);
7600 if (result) {
7601 Py_RETURN_TRUE;
7602 }
7603 }
7604 /* nothing matched */
7605 Py_RETURN_FALSE;
7606 }
7607 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007609 return NULL;
7610 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007611 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007612 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613}
7614
7615
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007616PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007617"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007618\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007619Return True if S ends with the specified suffix, False otherwise.\n\
7620With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007621With optional end, stop comparing S at that position.\n\
7622suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007623
7624static PyObject *
7625unicode_endswith(PyUnicodeObject *self,
7626 PyObject *args)
7627{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007628 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007629 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007630 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007631 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007632 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007633
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007634 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7635 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007636 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007637 if (PyTuple_Check(subobj)) {
7638 Py_ssize_t i;
7639 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7640 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7641 PyTuple_GET_ITEM(subobj, i));
7642 if (substring == NULL)
7643 return NULL;
7644 result = tailmatch(self, substring, start, end, +1);
7645 Py_DECREF(substring);
7646 if (result) {
7647 Py_RETURN_TRUE;
7648 }
7649 }
7650 Py_RETURN_FALSE;
7651 }
7652 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007653 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007654 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007656 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007658 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007659}
7660
7661
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007662
7663static PyObject *
7664unicode_getnewargs(PyUnicodeObject *v)
7665{
7666 return Py_BuildValue("(u#)", v->str, v->length);
7667}
7668
7669
Guido van Rossumd57fd912000-03-10 22:53:23 +00007670static PyMethodDef unicode_methods[] = {
7671
7672 /* Order is according to common usage: often used methods should
7673 appear first, since lookup is done sequentially. */
7674
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007675 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7676 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7677 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007678 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007679 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7680 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7681 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7682 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7683 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7684 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7685 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007686 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007687 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7688 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7689 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007690 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007691 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007692/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7693 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7694 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7695 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007696 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007697 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007698 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007699 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007700 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7701 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7702 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7703 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7704 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7705 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7706 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7707 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7708 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7709 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7710 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7711 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7712 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7713 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007714 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007715#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007716 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717#endif
7718
7719#if 0
7720 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007721 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722#endif
7723
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007724 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007725 {NULL, NULL}
7726};
7727
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007728static PyObject *
7729unicode_mod(PyObject *v, PyObject *w)
7730{
7731 if (!PyUnicode_Check(v)) {
7732 Py_INCREF(Py_NotImplemented);
7733 return Py_NotImplemented;
7734 }
7735 return PyUnicode_Format(v, w);
7736}
7737
7738static PyNumberMethods unicode_as_number = {
7739 0, /*nb_add*/
7740 0, /*nb_subtract*/
7741 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007742 unicode_mod, /*nb_remainder*/
7743};
7744
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007746 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007747 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007748 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7749 (ssizeargfunc) unicode_getitem, /* sq_item */
7750 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751 0, /* sq_ass_item */
7752 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007753 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754};
7755
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007756static PyObject*
7757unicode_subscript(PyUnicodeObject* self, PyObject* item)
7758{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007759 if (PyIndex_Check(item)) {
7760 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007761 if (i == -1 && PyErr_Occurred())
7762 return NULL;
7763 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007764 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007765 return unicode_getitem(self, i);
7766 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007767 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007768 Py_UNICODE* source_buf;
7769 Py_UNICODE* result_buf;
7770 PyObject* result;
7771
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007772 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007773 &start, &stop, &step, &slicelength) < 0) {
7774 return NULL;
7775 }
7776
7777 if (slicelength <= 0) {
7778 return PyUnicode_FromUnicode(NULL, 0);
7779 } else {
7780 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007781 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7782 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007783
7784 if (result_buf == NULL)
7785 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007786
7787 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7788 result_buf[i] = source_buf[cur];
7789 }
Tim Petersced69f82003-09-16 20:30:58 +00007790
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007791 result = PyUnicode_FromUnicode(result_buf, slicelength);
7792 PyMem_FREE(result_buf);
7793 return result;
7794 }
7795 } else {
7796 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7797 return NULL;
7798 }
7799}
7800
7801static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007802 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007803 (binaryfunc)unicode_subscript, /* mp_subscript */
7804 (objobjargproc)0, /* mp_ass_subscript */
7805};
7806
Martin v. Löwis18e16552006-02-15 17:27:45 +00007807static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007808unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007809 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007810 const void **ptr)
7811{
7812 if (index != 0) {
7813 PyErr_SetString(PyExc_SystemError,
7814 "accessing non-existent unicode segment");
7815 return -1;
7816 }
7817 *ptr = (void *) self->str;
7818 return PyUnicode_GET_DATA_SIZE(self);
7819}
7820
Martin v. Löwis18e16552006-02-15 17:27:45 +00007821static Py_ssize_t
7822unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007823 const void **ptr)
7824{
7825 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007826 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007827 return -1;
7828}
7829
7830static int
7831unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007832 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007833{
7834 if (lenp)
7835 *lenp = PyUnicode_GET_DATA_SIZE(self);
7836 return 1;
7837}
7838
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007839static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007840unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007841 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007842 const void **ptr)
7843{
7844 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007845
Guido van Rossumd57fd912000-03-10 22:53:23 +00007846 if (index != 0) {
7847 PyErr_SetString(PyExc_SystemError,
7848 "accessing non-existent unicode segment");
7849 return -1;
7850 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007851 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007852 if (str == NULL)
7853 return -1;
7854 *ptr = (void *) PyString_AS_STRING(str);
7855 return PyString_GET_SIZE(str);
7856}
7857
7858/* Helpers for PyUnicode_Format() */
7859
7860static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007861getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007862{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007863 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007864 if (argidx < arglen) {
7865 (*p_argidx)++;
7866 if (arglen < 0)
7867 return args;
7868 else
7869 return PyTuple_GetItem(args, argidx);
7870 }
7871 PyErr_SetString(PyExc_TypeError,
7872 "not enough arguments for format string");
7873 return NULL;
7874}
7875
/* Bit flags collected while parsing a format specifier; each flag
   character sets one bit. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
7881
Martin v. Löwis18e16552006-02-15 17:27:45 +00007882static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007883strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007885 register Py_ssize_t i;
7886 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007887 for (i = len - 1; i >= 0; i--)
7888 buffer[i] = (Py_UNICODE) charbuffer[i];
7889
Guido van Rossumd57fd912000-03-10 22:53:23 +00007890 return len;
7891}
7892
Neal Norwitzfc76d632006-01-10 06:03:13 +00007893static int
7894doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7895{
Tim Peters15231542006-02-16 01:08:01 +00007896 Py_ssize_t result;
7897
Neal Norwitzfc76d632006-01-10 06:03:13 +00007898 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007899 result = strtounicode(buffer, (char *)buffer);
7900 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007901}
7902
7903static int
7904longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7905{
Tim Peters15231542006-02-16 01:08:01 +00007906 Py_ssize_t result;
7907
Neal Norwitzfc76d632006-01-10 06:03:13 +00007908 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007909 result = strtounicode(buffer, (char *)buffer);
7910 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007911}
7912
Guido van Rossum078151d2002-08-11 04:24:12 +00007913/* XXX To save some code duplication, formatfloat/long/int could have been
7914 shared with stringobject.c, converting from 8-bit to Unicode after the
7915 formatting is done. */
7916
Guido van Rossumd57fd912000-03-10 22:53:23 +00007917static int
7918formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007919 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007920 int flags,
7921 int prec,
7922 int type,
7923 PyObject *v)
7924{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007925 /* fmt = '%#.' + `prec` + `type`
7926 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007927 char fmt[20];
7928 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007929
Guido van Rossumd57fd912000-03-10 22:53:23 +00007930 x = PyFloat_AsDouble(v);
7931 if (x == -1.0 && PyErr_Occurred())
7932 return -1;
7933 if (prec < 0)
7934 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007935 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7936 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007937 /* Worst case length calc to ensure no buffer overrun:
7938
7939 'g' formats:
7940 fmt = %#.<prec>g
7941 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7942 for any double rep.)
7943 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7944
7945 'f' formats:
7946 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7947 len = 1 + 50 + 1 + prec = 52 + prec
7948
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007949 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007950 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007951
7952 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00007953 if (((type == 'g' || type == 'G') &&
7954 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007955 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007956 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007957 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007958 return -1;
7959 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007960 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7961 (flags&F_ALT) ? "#" : "",
7962 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007963 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007964}
7965
Tim Peters38fd5b62000-09-21 05:43:11 +00007966static PyObject*
7967formatlong(PyObject *val, int flags, int prec, int type)
7968{
7969 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00007970 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00007971 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00007972 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00007973
7974 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7975 if (!str)
7976 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00007977 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00007978 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00007979 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00007980}
7981
Guido van Rossumd57fd912000-03-10 22:53:23 +00007982static int
7983formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007984 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985 int flags,
7986 int prec,
7987 int type,
7988 PyObject *v)
7989{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007990 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007991 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7992 * + 1 + 1
7993 * = 24
7994 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007995 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007996 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007997 long x;
7998
7999 x = PyInt_AsLong(v);
8000 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008001 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008002 if (x < 0 && type == 'u') {
8003 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008004 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008005 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8006 sign = "-";
8007 else
8008 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008010 prec = 1;
8011
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008012 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8013 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008014 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008015 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008016 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008017 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008018 return -1;
8019 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008020
8021 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008022 (type == 'x' || type == 'X' || type == 'o')) {
8023 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008024 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008025 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008026 * - when 0 is being converted, the C standard leaves off
8027 * the '0x' or '0X', which is inconsistent with other
8028 * %#x/%#X conversions and inconsistent with Python's
8029 * hex() function
8030 * - there are platforms that violate the standard and
8031 * convert 0 with the '0x' or '0X'
8032 * (Metrowerks, Compaq Tru64)
8033 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008034 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008035 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008036 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008037 * We can achieve the desired consistency by inserting our
8038 * own '0x' or '0X' prefix, and substituting %x/%X in place
8039 * of %#x/%#X.
8040 *
8041 * Note that this is the same approach as used in
8042 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008043 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008044 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8045 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008046 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008047 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008048 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8049 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008050 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008051 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008052 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008053 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008054 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008055 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008056}
8057
8058static int
8059formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008060 size_t buflen,
8061 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008062{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008063 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008064 if (PyUnicode_Check(v)) {
8065 if (PyUnicode_GET_SIZE(v) != 1)
8066 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008067 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008068 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008069
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008070 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008071 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008072 goto onError;
8073 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8074 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008075
8076 else {
8077 /* Integer input truncated to a character */
8078 long x;
8079 x = PyInt_AsLong(v);
8080 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008081 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008082#ifdef Py_UNICODE_WIDE
8083 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008084 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008085 "%c arg not in range(0x110000) "
8086 "(wide Python build)");
8087 return -1;
8088 }
8089#else
8090 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008091 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008092 "%c arg not in range(0x10000) "
8093 "(narrow Python build)");
8094 return -1;
8095 }
8096#endif
8097 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008098 }
8099 buf[1] = '\0';
8100 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008101
8102 onError:
8103 PyErr_SetString(PyExc_TypeError,
8104 "%c requires int or char");
8105 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008106}
8107
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so the macro behaves as a single
   operand wherever it is used (e.g. "sizeof FORMATBUFLEN").
*/
#define FORMATBUFLEN ((size_t)120)
8117
Guido van Rossumd57fd912000-03-10 22:53:23 +00008118PyObject *PyUnicode_Format(PyObject *format,
8119 PyObject *args)
8120{
8121 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008122 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123 int args_owned = 0;
8124 PyUnicodeObject *result = NULL;
8125 PyObject *dict = NULL;
8126 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008127
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128 if (format == NULL || args == NULL) {
8129 PyErr_BadInternalCall();
8130 return NULL;
8131 }
8132 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008133 if (uformat == NULL)
8134 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008135 fmt = PyUnicode_AS_UNICODE(uformat);
8136 fmtcnt = PyUnicode_GET_SIZE(uformat);
8137
8138 reslen = rescnt = fmtcnt + 100;
8139 result = _PyUnicode_New(reslen);
8140 if (result == NULL)
8141 goto onError;
8142 res = PyUnicode_AS_UNICODE(result);
8143
8144 if (PyTuple_Check(args)) {
8145 arglen = PyTuple_Size(args);
8146 argidx = 0;
8147 }
8148 else {
8149 arglen = -1;
8150 argidx = -2;
8151 }
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008152 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008153 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008154 dict = args;
8155
8156 while (--fmtcnt >= 0) {
8157 if (*fmt != '%') {
8158 if (--rescnt < 0) {
8159 rescnt = fmtcnt + 100;
8160 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008161 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008162 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008163 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8164 --rescnt;
8165 }
8166 *res++ = *fmt++;
8167 }
8168 else {
8169 /* Got a format specifier */
8170 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008171 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008172 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008173 Py_UNICODE c = '\0';
8174 Py_UNICODE fill;
8175 PyObject *v = NULL;
8176 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008177 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008178 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008179 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008180 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008181
8182 fmt++;
8183 if (*fmt == '(') {
8184 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008185 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008186 PyObject *key;
8187 int pcount = 1;
8188
8189 if (dict == NULL) {
8190 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008191 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008192 goto onError;
8193 }
8194 ++fmt;
8195 --fmtcnt;
8196 keystart = fmt;
8197 /* Skip over balanced parentheses */
8198 while (pcount > 0 && --fmtcnt >= 0) {
8199 if (*fmt == ')')
8200 --pcount;
8201 else if (*fmt == '(')
8202 ++pcount;
8203 fmt++;
8204 }
8205 keylen = fmt - keystart - 1;
8206 if (fmtcnt < 0 || pcount > 0) {
8207 PyErr_SetString(PyExc_ValueError,
8208 "incomplete format key");
8209 goto onError;
8210 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008211#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008212 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008213 then looked up since Python uses strings to hold
8214 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008215 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216 key = PyUnicode_EncodeUTF8(keystart,
8217 keylen,
8218 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008219#else
8220 key = PyUnicode_FromUnicode(keystart, keylen);
8221#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008222 if (key == NULL)
8223 goto onError;
8224 if (args_owned) {
8225 Py_DECREF(args);
8226 args_owned = 0;
8227 }
8228 args = PyObject_GetItem(dict, key);
8229 Py_DECREF(key);
8230 if (args == NULL) {
8231 goto onError;
8232 }
8233 args_owned = 1;
8234 arglen = -1;
8235 argidx = -2;
8236 }
8237 while (--fmtcnt >= 0) {
8238 switch (c = *fmt++) {
8239 case '-': flags |= F_LJUST; continue;
8240 case '+': flags |= F_SIGN; continue;
8241 case ' ': flags |= F_BLANK; continue;
8242 case '#': flags |= F_ALT; continue;
8243 case '0': flags |= F_ZERO; continue;
8244 }
8245 break;
8246 }
8247 if (c == '*') {
8248 v = getnextarg(args, arglen, &argidx);
8249 if (v == NULL)
8250 goto onError;
8251 if (!PyInt_Check(v)) {
8252 PyErr_SetString(PyExc_TypeError,
8253 "* wants int");
8254 goto onError;
8255 }
8256 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008257 if (width == -1 && PyErr_Occurred())
8258 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008259 if (width < 0) {
8260 flags |= F_LJUST;
8261 width = -width;
8262 }
8263 if (--fmtcnt >= 0)
8264 c = *fmt++;
8265 }
8266 else if (c >= '0' && c <= '9') {
8267 width = c - '0';
8268 while (--fmtcnt >= 0) {
8269 c = *fmt++;
8270 if (c < '0' || c > '9')
8271 break;
8272 if ((width*10) / 10 != width) {
8273 PyErr_SetString(PyExc_ValueError,
8274 "width too big");
8275 goto onError;
8276 }
8277 width = width*10 + (c - '0');
8278 }
8279 }
8280 if (c == '.') {
8281 prec = 0;
8282 if (--fmtcnt >= 0)
8283 c = *fmt++;
8284 if (c == '*') {
8285 v = getnextarg(args, arglen, &argidx);
8286 if (v == NULL)
8287 goto onError;
8288 if (!PyInt_Check(v)) {
8289 PyErr_SetString(PyExc_TypeError,
8290 "* wants int");
8291 goto onError;
8292 }
8293 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008294 if (prec == -1 && PyErr_Occurred())
8295 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296 if (prec < 0)
8297 prec = 0;
8298 if (--fmtcnt >= 0)
8299 c = *fmt++;
8300 }
8301 else if (c >= '0' && c <= '9') {
8302 prec = c - '0';
8303 while (--fmtcnt >= 0) {
8304 c = Py_CHARMASK(*fmt++);
8305 if (c < '0' || c > '9')
8306 break;
8307 if ((prec*10) / 10 != prec) {
8308 PyErr_SetString(PyExc_ValueError,
8309 "prec too big");
8310 goto onError;
8311 }
8312 prec = prec*10 + (c - '0');
8313 }
8314 }
8315 } /* prec */
8316 if (fmtcnt >= 0) {
8317 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008318 if (--fmtcnt >= 0)
8319 c = *fmt++;
8320 }
8321 }
8322 if (fmtcnt < 0) {
8323 PyErr_SetString(PyExc_ValueError,
8324 "incomplete format");
8325 goto onError;
8326 }
8327 if (c != '%') {
8328 v = getnextarg(args, arglen, &argidx);
8329 if (v == NULL)
8330 goto onError;
8331 }
8332 sign = 0;
8333 fill = ' ';
8334 switch (c) {
8335
8336 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008337 pbuf = formatbuf;
8338 /* presume that buffer length is at least 1 */
8339 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340 len = 1;
8341 break;
8342
8343 case 's':
8344 case 'r':
8345 if (PyUnicode_Check(v) && c == 's') {
8346 temp = v;
8347 Py_INCREF(temp);
8348 }
8349 else {
8350 PyObject *unicode;
8351 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008352 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008353 else
8354 temp = PyObject_Repr(v);
8355 if (temp == NULL)
8356 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008357 if (PyUnicode_Check(temp))
8358 /* nothing to do */;
8359 else if (PyString_Check(temp)) {
8360 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008361 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008362 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008363 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008364 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008365 Py_DECREF(temp);
8366 temp = unicode;
8367 if (temp == NULL)
8368 goto onError;
8369 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008370 else {
8371 Py_DECREF(temp);
8372 PyErr_SetString(PyExc_TypeError,
8373 "%s argument has non-string str()");
8374 goto onError;
8375 }
8376 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008377 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008378 len = PyUnicode_GET_SIZE(temp);
8379 if (prec >= 0 && len > prec)
8380 len = prec;
8381 break;
8382
8383 case 'i':
8384 case 'd':
8385 case 'u':
8386 case 'o':
8387 case 'x':
8388 case 'X':
8389 if (c == 'i')
8390 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008391 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008392 temp = formatlong(v, flags, prec, c);
8393 if (!temp)
8394 goto onError;
8395 pbuf = PyUnicode_AS_UNICODE(temp);
8396 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008397 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008398 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008399 else {
8400 pbuf = formatbuf;
8401 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8402 flags, prec, c, v);
8403 if (len < 0)
8404 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008405 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008406 }
8407 if (flags & F_ZERO)
8408 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008409 break;
8410
8411 case 'e':
8412 case 'E':
8413 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008414 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008415 case 'g':
8416 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008417 if (c == 'F')
8418 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008419 pbuf = formatbuf;
8420 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8421 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008422 if (len < 0)
8423 goto onError;
8424 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008425 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008426 fill = '0';
8427 break;
8428
8429 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008430 pbuf = formatbuf;
8431 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008432 if (len < 0)
8433 goto onError;
8434 break;
8435
8436 default:
8437 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008438 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008439 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008440 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008441 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008442 (Py_ssize_t)(fmt - 1 -
8443 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008444 goto onError;
8445 }
8446 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008447 if (*pbuf == '-' || *pbuf == '+') {
8448 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008449 len--;
8450 }
8451 else if (flags & F_SIGN)
8452 sign = '+';
8453 else if (flags & F_BLANK)
8454 sign = ' ';
8455 else
8456 sign = 0;
8457 }
8458 if (width < len)
8459 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008460 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008461 reslen -= rescnt;
8462 rescnt = width + fmtcnt + 100;
8463 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008464 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008465 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008466 PyErr_NoMemory();
8467 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008468 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008469 if (_PyUnicode_Resize(&result, reslen) < 0) {
8470 Py_XDECREF(temp);
8471 goto onError;
8472 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008473 res = PyUnicode_AS_UNICODE(result)
8474 + reslen - rescnt;
8475 }
8476 if (sign) {
8477 if (fill != ' ')
8478 *res++ = sign;
8479 rescnt--;
8480 if (width > len)
8481 width--;
8482 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008483 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008484 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008485 assert(pbuf[1] == c);
8486 if (fill != ' ') {
8487 *res++ = *pbuf++;
8488 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008489 }
Tim Petersfff53252001-04-12 18:38:48 +00008490 rescnt -= 2;
8491 width -= 2;
8492 if (width < 0)
8493 width = 0;
8494 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008495 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008496 if (width > len && !(flags & F_LJUST)) {
8497 do {
8498 --rescnt;
8499 *res++ = fill;
8500 } while (--width > len);
8501 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008502 if (fill == ' ') {
8503 if (sign)
8504 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008505 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008506 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008507 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008508 *res++ = *pbuf++;
8509 *res++ = *pbuf++;
8510 }
8511 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008512 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008513 res += len;
8514 rescnt -= len;
8515 while (--width >= len) {
8516 --rescnt;
8517 *res++ = ' ';
8518 }
8519 if (dict && (argidx < arglen) && c != '%') {
8520 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008521 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008522 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008523 goto onError;
8524 }
8525 Py_XDECREF(temp);
8526 } /* '%' */
8527 } /* until end */
8528 if (argidx < arglen && !dict) {
8529 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008530 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008531 goto onError;
8532 }
8533
Thomas Woutersa96affe2006-03-12 00:29:36 +00008534 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8535 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008536 if (args_owned) {
8537 Py_DECREF(args);
8538 }
8539 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008540 return (PyObject *)result;
8541
8542 onError:
8543 Py_XDECREF(result);
8544 Py_DECREF(uformat);
8545 if (args_owned) {
8546 Py_DECREF(args);
8547 }
8548 return NULL;
8549}
8550
8551static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008552 (readbufferproc) unicode_buffer_getreadbuf,
8553 (writebufferproc) unicode_buffer_getwritebuf,
8554 (segcountproc) unicode_buffer_getsegcount,
8555 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008556};
8557
Jeremy Hylton938ace62002-07-17 16:30:39 +00008558static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008559unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8560
Tim Peters6d6c1a32001-08-02 04:15:00 +00008561static PyObject *
8562unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8563{
8564 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00008565 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008566 char *encoding = NULL;
8567 char *errors = NULL;
8568
Guido van Rossume023fe02001-08-30 03:12:59 +00008569 if (type != &PyUnicode_Type)
8570 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008571 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8572 kwlist, &x, &encoding, &errors))
8573 return NULL;
8574 if (x == NULL)
8575 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008576 if (encoding == NULL && errors == NULL)
8577 return PyObject_Unicode(x);
8578 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008579 return PyUnicode_FromEncodedObject(x, encoding, errors);
8580}
8581
Guido van Rossume023fe02001-08-30 03:12:59 +00008582static PyObject *
8583unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8584{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008585 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008586 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008587
8588 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8589 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8590 if (tmp == NULL)
8591 return NULL;
8592 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008593 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008594 if (pnew == NULL) {
8595 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008596 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008597 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008598 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8599 if (pnew->str == NULL) {
8600 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008601 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008602 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008603 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008604 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008605 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8606 pnew->length = n;
8607 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008608 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008609 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008610}
8611
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008612PyDoc_STRVAR(unicode_doc,
Collin Winterd474ce82007-08-07 19:42:11 +00008613"str(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008614\n\
Collin Winterd474ce82007-08-07 19:42:11 +00008615Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008616encoding defaults to the current default string encoding.\n\
8617errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008618
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008619static PyObject *unicode_iter(PyObject *seq);
8620
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008622 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008623 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624 sizeof(PyUnicodeObject), /* tp_size */
8625 0, /* tp_itemsize */
8626 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008627 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008628 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008629 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008630 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008631 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008632 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008633 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008635 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008636 (hashfunc) unicode_hash, /* tp_hash*/
8637 0, /* tp_call*/
8638 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008639 PyObject_GenericGetAttr, /* tp_getattro */
8640 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008641 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008642 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8643 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008644 unicode_doc, /* tp_doc */
8645 0, /* tp_traverse */
8646 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008647 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008648 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008649 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008650 0, /* tp_iternext */
8651 unicode_methods, /* tp_methods */
8652 0, /* tp_members */
8653 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008654 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008655 0, /* tp_dict */
8656 0, /* tp_descr_get */
8657 0, /* tp_descr_set */
8658 0, /* tp_dictoffset */
8659 0, /* tp_init */
8660 0, /* tp_alloc */
8661 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008662 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008663};
8664
8665/* Initialize the Unicode implementation */
8666
Thomas Wouters78890102000-07-22 19:25:51 +00008667void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008668{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008669 int i;
8670
Thomas Wouters477c8d52006-05-27 19:21:47 +00008671 /* XXX - move this array to unicodectype.c ? */
8672 Py_UNICODE linebreak[] = {
8673 0x000A, /* LINE FEED */
8674 0x000D, /* CARRIAGE RETURN */
8675 0x001C, /* FILE SEPARATOR */
8676 0x001D, /* GROUP SEPARATOR */
8677 0x001E, /* RECORD SEPARATOR */
8678 0x0085, /* NEXT LINE */
8679 0x2028, /* LINE SEPARATOR */
8680 0x2029, /* PARAGRAPH SEPARATOR */
8681 };
8682
Fred Drakee4315f52000-05-09 19:53:39 +00008683 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008684 unicode_freelist = NULL;
8685 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008687 if (!unicode_empty)
8688 return;
8689
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008690 for (i = 0; i < 256; i++)
8691 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008692 if (PyType_Ready(&PyUnicode_Type) < 0)
8693 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008694
8695 /* initialize the linebreak bloom filter */
8696 bloom_linebreak = make_bloom_mask(
8697 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8698 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008699
8700 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008701}
8702
8703/* Finalize the Unicode implementation */
8704
8705void
Thomas Wouters78890102000-07-22 19:25:51 +00008706_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008707{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008708 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008709 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008710
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008711 Py_XDECREF(unicode_empty);
8712 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008713
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008714 for (i = 0; i < 256; i++) {
8715 if (unicode_latin1[i]) {
8716 Py_DECREF(unicode_latin1[i]);
8717 unicode_latin1[i] = NULL;
8718 }
8719 }
8720
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008721 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008722 PyUnicodeObject *v = u;
8723 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008724 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008725 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008726 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008727 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008728 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008729 unicode_freelist = NULL;
8730 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008731}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008732
/* Intern the unicode object *p in place.

   If an equal string is already present in the `interned` dict, *p is
   replaced by that object: the reference the caller held on the old
   object is dropped and a new reference to the interned one is stored.
   Otherwise *p itself is inserted into the dict and marked
   SSTATE_INTERNED_MORTAL.

   The function never reports an error: it returns without changing *p
   for subclass instances, for already-interned strings, and when the
   dict cannot be created or updated (the exception is cleared in those
   cases).  Passing NULL or a non-unicode object is a fatal error. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
    PyObject *t;
    if (s == NULL || !PyUnicode_Check(s))
        Py_FatalError(
            "PyUnicode_InternInPlace: unicode strings only please!");
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    /* The interned dict is created lazily on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* It might be that the GetItem call fails even
       though the key is present in the dictionary,
       namely when this happens during a stack overflow. */
    /* NOTE(review): Py_ALLOW_RECURSION / Py_END_ALLOW_RECURSION are defined
       elsewhere; presumably they suspend the recursion-critical state
       around the lookup -- confirm against their definition. */
    Py_ALLOW_RECURSION
    t = PyDict_GetItem(interned, (PyObject *)s);
    Py_END_ALLOW_RECURSION

    if (t) {
        /* An equal string is already interned: hand it to the caller in
           place of the original.  Take the new reference first, then
           release the old one. */
        Py_INCREF(t);
        Py_DECREF(*p);
        *p = t;
        return;
    }

    /* The insertion runs with recursion_critical set; the flag is reset
       on both the failure and the success path. */
    PyThreadState_GET()->recursion_critical = 1;
    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
        PyErr_Clear();
        PyThreadState_GET()->recursion_critical = 0;
        return;
    }
    PyThreadState_GET()->recursion_critical = 0;
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    /* (s was stored both as key and as value, hence the 2.) */
    Py_Refcnt(s) -= 2;
    PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
}
8780
8781void
8782PyUnicode_InternImmortal(PyObject **p)
8783{
8784 PyUnicode_InternInPlace(p);
8785 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
8786 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
8787 Py_INCREF(*p);
8788 }
8789}
8790
8791PyObject *
8792PyUnicode_InternFromString(const char *cp)
8793{
8794 PyObject *s = PyUnicode_FromString(cp);
8795 if (s == NULL)
8796 return NULL;
8797 PyUnicode_InternInPlace(&s);
8798 return s;
8799}
8800
8801void _Py_ReleaseInternedUnicodeStrings(void)
8802{
8803 PyObject *keys;
8804 PyUnicodeObject *s;
8805 Py_ssize_t i, n;
8806 Py_ssize_t immortal_size = 0, mortal_size = 0;
8807
8808 if (interned == NULL || !PyDict_Check(interned))
8809 return;
8810 keys = PyDict_Keys(interned);
8811 if (keys == NULL || !PyList_Check(keys)) {
8812 PyErr_Clear();
8813 return;
8814 }
8815
8816 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
8817 detector, interned unicode strings are not forcibly deallocated;
8818 rather, we give them their stolen references back, and then clear
8819 and DECREF the interned dict. */
8820
8821 n = PyList_GET_SIZE(keys);
8822 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
8823 n);
8824 for (i = 0; i < n; i++) {
8825 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
8826 switch (s->state) {
8827 case SSTATE_NOT_INTERNED:
8828 /* XXX Shouldn't happen */
8829 break;
8830 case SSTATE_INTERNED_IMMORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00008831 Py_Refcnt(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00008832 immortal_size += s->length;
8833 break;
8834 case SSTATE_INTERNED_MORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00008835 Py_Refcnt(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00008836 mortal_size += s->length;
8837 break;
8838 default:
8839 Py_FatalError("Inconsistent interned string state.");
8840 }
8841 s->state = SSTATE_NOT_INTERNED;
8842 }
8843 fprintf(stderr, "total size of all interned strings: "
8844 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
8845 "mortal/immortal\n", mortal_size, immortal_size);
8846 Py_DECREF(keys);
8847 PyDict_Clear(interned);
8848 Py_DECREF(interned);
8849 interned = NULL;
8850}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008851
8852
8853/********************* Unicode Iterator **************************/
8854
8855typedef struct {
8856 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00008857 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008858 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
8859} unicodeiterobject;
8860
8861static void
8862unicodeiter_dealloc(unicodeiterobject *it)
8863{
8864 _PyObject_GC_UNTRACK(it);
8865 Py_XDECREF(it->it_seq);
8866 PyObject_GC_Del(it);
8867}
8868
8869static int
8870unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8871{
8872 Py_VISIT(it->it_seq);
8873 return 0;
8874}
8875
8876static PyObject *
8877unicodeiter_next(unicodeiterobject *it)
8878{
8879 PyUnicodeObject *seq;
8880 PyObject *item;
8881
8882 assert(it != NULL);
8883 seq = it->it_seq;
8884 if (seq == NULL)
8885 return NULL;
8886 assert(PyUnicode_Check(seq));
8887
8888 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008889 item = PyUnicode_FromUnicode(
8890 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008891 if (item != NULL)
8892 ++it->it_index;
8893 return item;
8894 }
8895
8896 Py_DECREF(seq);
8897 it->it_seq = NULL;
8898 return NULL;
8899}
8900
8901static PyObject *
8902unicodeiter_len(unicodeiterobject *it)
8903{
8904 Py_ssize_t len = 0;
8905 if (it->it_seq)
8906 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8907 return PyInt_FromSsize_t(len);
8908}
8909
8910PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8911
8912static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008913 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8914 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008915 {NULL, NULL} /* sentinel */
8916};
8917
8918PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008919 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008920 "unicodeiterator", /* tp_name */
8921 sizeof(unicodeiterobject), /* tp_basicsize */
8922 0, /* tp_itemsize */
8923 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00008924 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008925 0, /* tp_print */
8926 0, /* tp_getattr */
8927 0, /* tp_setattr */
8928 0, /* tp_compare */
8929 0, /* tp_repr */
8930 0, /* tp_as_number */
8931 0, /* tp_as_sequence */
8932 0, /* tp_as_mapping */
8933 0, /* tp_hash */
8934 0, /* tp_call */
8935 0, /* tp_str */
8936 PyObject_GenericGetAttr, /* tp_getattro */
8937 0, /* tp_setattro */
8938 0, /* tp_as_buffer */
8939 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8940 0, /* tp_doc */
8941 (traverseproc)unicodeiter_traverse, /* tp_traverse */
8942 0, /* tp_clear */
8943 0, /* tp_richcompare */
8944 0, /* tp_weaklistoffset */
8945 PyObject_SelfIter, /* tp_iter */
8946 (iternextfunc)unicodeiter_next, /* tp_iternext */
8947 unicodeiter_methods, /* tp_methods */
8948 0,
8949};
8950
8951static PyObject *
8952unicode_iter(PyObject *seq)
8953{
8954 unicodeiterobject *it;
8955
8956 if (!PyUnicode_Check(seq)) {
8957 PyErr_BadInternalCall();
8958 return NULL;
8959 }
8960 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8961 if (it == NULL)
8962 return NULL;
8963 it->it_index = 0;
8964 Py_INCREF(seq);
8965 it->it_seq = (PyUnicodeObject *)seq;
8966 _PyObject_GC_TRACK(it);
8967 return (PyObject *)it;
8968}
8969
Martin v. Löwis5b222132007-06-10 09:51:05 +00008970size_t
8971Py_UNICODE_strlen(const Py_UNICODE *u)
8972{
8973 int res = 0;
8974 while(*u++)
8975 res++;
8976 return res;
8977}
8978
8979Py_UNICODE*
8980Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
8981{
8982 Py_UNICODE *u = s1;
8983 while ((*u++ = *s2++));
8984 return s1;
8985}
8986
8987Py_UNICODE*
8988Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
8989{
8990 Py_UNICODE *u = s1;
8991 while ((*u++ = *s2++))
8992 if (n-- == 0)
8993 break;
8994 return s1;
8995}
8996
8997int
8998Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
8999{
9000 while (*s1 && *s2 && *s1 == *s2)
9001 s1++, s2++;
9002 if (*s1 && *s2)
9003 return (*s1 < *s2) ? -1 : +1;
9004 if (*s1)
9005 return 1;
9006 if (*s2)
9007 return -1;
9008 return 0;
9009}
9010
9011Py_UNICODE*
9012Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9013{
9014 const Py_UNICODE *p;
9015 for (p = s; *p; p++)
9016 if (*p == c)
9017 return (Py_UNICODE*)p;
9018 return NULL;
9019}
9020
9021
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009022#ifdef __cplusplus
9023}
9024#endif
9025
9026
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009027/*
9028Local variables:
9029c-basic-offset: 4
9030indent-tabs-mode: nil
9031End:
9032*/