blob: d1b5747f5798ccbf3e5e655910a727088ffc31e2 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of Unicode objects kept on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Threshold for the free list "keep alive" optimization.

   When an object is put on the free list, its character buffer is kept
   allocated as long as the object's length is below this limit.  This
   saves malloc() calls for small Unicode objects.

   In the worst case this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + malloc()-overhead)
   bytes of unused memory lying around.

   A limit of 0 effectively disables the feature.

   Note: this feature is experimental.  If Unicode objects cause core
   dumps, try turning it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian unless WORDS_BIGENDIAN is defined. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Walter Dörwald16807132007-05-25 13:52:07 +000095/* This dictionary holds all interned unicode strings. Note that references
96 to strings in this dictionary are *not* counted in the string's ob_refcnt.
97 When the interned string reaches a refcnt of 0 the string deallocation
98 function will delete the reference from this dictionary.
99
100 Another way to look at this is that to say that the actual reference
101 count of a string is: s->ob_refcnt + (s->ob_sstate?2:0)
102*/
103static PyObject *interned;
104
Guido van Rossumd57fd912000-03-10 22:53:23 +0000105/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000106static PyUnicodeObject *unicode_freelist;
107static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000109/* The empty Unicode object is shared to improve performance. */
110static PyUnicodeObject *unicode_empty;
111
112/* Single character Unicode strings in the Latin-1 range are being
113 shared as well. */
114static PyUnicodeObject *unicode_latin1[256];
115
Fred Drakee4315f52000-05-09 19:53:39 +0000116/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000117 parameter; it is fixed to "utf-8". Always use the
118 PyUnicode_GetDefaultEncoding() API to access this global. */
119static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000120
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000121Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000122PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000123{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000124#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000125 return 0x10FFFF;
126#else
127 /* This is actually an illegal character, so it should
128 not be passed to unichr. */
129 return 0xFFFF;
130#endif
131}
132
Thomas Wouters477c8d52006-05-27 19:21:47 +0000133/* --- Bloom Filters ----------------------------------------------------- */
134
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple a single bitmask is used; the low 5 bits of
   each Unicode character select the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by ch is set in mask.  The shifted
   constant is unsigned long so that bit 31 can be selected without
   shifting into the sign bit of an int (undefined behaviour, and a
   source of spurious high bits once widened to BLOOM_MASK). */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap bloom pre-check followed by the exact linebreak test. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
149
150Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
151{
152 /* calculate simple bloom-style bitmask for a given unicode string */
153
154 long mask;
155 Py_ssize_t i;
156
157 mask = 0;
158 for (i = 0; i < len; i++)
159 mask |= (1 << (ptr[i] & 0x1F));
160
161 return mask;
162}
163
164Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
165{
166 Py_ssize_t i;
167
168 for (i = 0; i < setlen; i++)
169 if (set[i] == chr)
170 return 1;
171
172 return 0;
173}
174
/* Exact membership test with a bloom-mask pre-check.  The expansion is
   fully parenthesized so it can be negated or combined with other
   operators at the call site. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
177
Guido van Rossumd57fd912000-03-10 22:53:23 +0000178/* --- Unicode Object ----------------------------------------------------- */
179
180static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000182 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183{
184 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000185
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000186 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000187 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000188 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190 /* Resizing shared object (unicode_empty or single character
191 objects) in-place is not allowed. Use PyUnicode_Resize()
192 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000194 if (unicode == unicode_empty ||
195 (unicode->length == 1 &&
196 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 return -1;
201 }
202
Thomas Wouters477c8d52006-05-27 19:21:47 +0000203 /* We allocate one more byte to make sure the string is Ux0000 terminated.
204 The overallocation is also used by fastsearch, which assumes that it's
205 safe to look at str[length] (without making any assumptions about what
206 it contains). */
207
Guido van Rossumd57fd912000-03-10 22:53:23 +0000208 oldstr = unicode->str;
209 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
210 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000211 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212 PyErr_NoMemory();
213 return -1;
214 }
215 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000216 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000218 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000220 if (unicode->defenc) {
221 Py_DECREF(unicode->defenc);
222 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000223 }
224 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000225
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 return 0;
227}
228
229/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000230 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231
232 XXX This allocator could further be enhanced by assuring that the
233 free list never reduces its size below 1.
234
235*/
236
237static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000238PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000239{
240 register PyUnicodeObject *unicode;
241
Thomas Wouters477c8d52006-05-27 19:21:47 +0000242 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (length == 0 && unicode_empty != NULL) {
244 Py_INCREF(unicode_empty);
245 return unicode_empty;
246 }
247
248 /* Unicode freelist & memory allocation */
249 if (unicode_freelist) {
250 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000251 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000254 /* Keep-Alive optimization: we only upsize the buffer,
255 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000256 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000257 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000258 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000259 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000262 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000264 }
265 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 }
267 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000268 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 if (unicode == NULL)
270 return NULL;
271 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
272 }
273
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000274 if (!unicode->str) {
275 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000276 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000277 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000278 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000279 * the caller fails before initializing str -- unicode_resize()
280 * reads str[0], and the Keep-Alive optimization can keep memory
281 * allocated for str alive across a call to unicode_dealloc(unicode).
282 * We don't want unicode_resize to read uninitialized memory in
283 * that case.
284 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000285 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000287 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000289 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000290 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000292
293 onError:
294 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000295 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000296 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297}
298
299static
Guido van Rossum9475a232001-10-05 20:51:39 +0000300void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301{
Walter Dörwald16807132007-05-25 13:52:07 +0000302 switch (PyUnicode_CHECK_INTERNED(unicode)) {
303 case SSTATE_NOT_INTERNED:
304 break;
305
306 case SSTATE_INTERNED_MORTAL:
307 /* revive dead object temporarily for DelItem */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +0000308 Py_Refcnt(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000309 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
310 Py_FatalError(
311 "deletion of interned unicode string failed");
312 break;
313
314 case SSTATE_INTERNED_IMMORTAL:
315 Py_FatalError("Immortal interned unicode string died.");
316
317 default:
318 Py_FatalError("Inconsistent interned unicode string state.");
319 }
320
Guido van Rossum604ddf82001-12-06 20:03:56 +0000321 if (PyUnicode_CheckExact(unicode) &&
322 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000323 /* Keep-Alive optimization */
324 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000325 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 unicode->str = NULL;
327 unicode->length = 0;
328 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000329 if (unicode->defenc) {
330 Py_DECREF(unicode->defenc);
331 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000332 }
333 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 *(PyUnicodeObject **)unicode = unicode_freelist;
335 unicode_freelist = unicode;
336 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 }
338 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000339 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000340 Py_XDECREF(unicode->defenc);
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000341 Py_Type(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 }
343}
344
Martin v. Löwis18e16552006-02-15 17:27:45 +0000345int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000346{
347 register PyUnicodeObject *v;
348
349 /* Argument checks */
350 if (unicode == NULL) {
351 PyErr_BadInternalCall();
352 return -1;
353 }
354 v = (PyUnicodeObject *)*unicode;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000355 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000356 PyErr_BadInternalCall();
357 return -1;
358 }
359
360 /* Resizing unicode_empty and single character objects is not
361 possible since these are being shared. We simply return a fresh
362 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000363 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000364 (v == unicode_empty || v->length == 1)) {
365 PyUnicodeObject *w = _PyUnicode_New(length);
366 if (w == NULL)
367 return -1;
368 Py_UNICODE_COPY(w->str, v->str,
369 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000370 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000371 *unicode = (PyObject *)w;
372 return 0;
373 }
374
375 /* Note that we don't have to modify *unicode for unshared Unicode
376 objects, since we can modify them in-place. */
377 return unicode_resize(v, length);
378}
379
/* Internal API for use in unicodeobject.c only !
   Convenience wrapper around PyUnicode_Resize() that casts the address
   of a PyUnicodeObject * variable to PyObject **. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
383
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000385 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386{
387 PyUnicodeObject *unicode;
388
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000389 /* If the Unicode data is known at construction time, we can apply
390 some optimizations which share commonly used objects. */
391 if (u != NULL) {
392
393 /* Optimization for empty strings */
394 if (size == 0 && unicode_empty != NULL) {
395 Py_INCREF(unicode_empty);
396 return (PyObject *)unicode_empty;
397 }
398
399 /* Single character Unicode objects in the Latin-1 range are
400 shared when using this constructor */
401 if (size == 1 && *u < 256) {
402 unicode = unicode_latin1[*u];
403 if (!unicode) {
404 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000405 if (!unicode)
406 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000407 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000408 unicode_latin1[*u] = unicode;
409 }
410 Py_INCREF(unicode);
411 return (PyObject *)unicode;
412 }
413 }
Tim Petersced69f82003-09-16 20:30:58 +0000414
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 unicode = _PyUnicode_New(size);
416 if (!unicode)
417 return NULL;
418
419 /* Copy the Unicode data into the new object */
420 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000421 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000422
423 return (PyObject *)unicode;
424}
425
Walter Dörwaldd2034312007-05-18 16:29:38 +0000426PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000427{
428 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000429 /* If the Unicode data is known at construction time, we can apply
430 some optimizations which share commonly used objects. */
431 if (u != NULL) {
432
433 /* Optimization for empty strings */
434 if (size == 0 && unicode_empty != NULL) {
435 Py_INCREF(unicode_empty);
436 return (PyObject *)unicode_empty;
437 }
438
Walter Dörwald071b9da2007-05-05 14:21:20 +0000439 /* Single characters are shared when using this constructor */
440 if (size == 1) {
Guido van Rossum00058aa2007-07-19 18:21:28 +0000441 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000442 if (!unicode) {
443 unicode = _PyUnicode_New(1);
444 if (!unicode)
445 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000446 unicode->str[0] = Py_CHARMASK(*u);
447 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000448 }
449 Py_INCREF(unicode);
450 return (PyObject *)unicode;
451 }
452 }
453
Walter Dörwald55507312007-05-18 13:12:10 +0000454 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000455 if (!unicode)
456 return NULL;
457
458 /* Copy the Unicode data into the new object */
459 if (u != NULL) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000460 Py_UNICODE *p = unicode->str;
Martin v. Löwis5b222132007-06-10 09:51:05 +0000461 while (size--)
Guido van Rossum00058aa2007-07-19 18:21:28 +0000462 *p++ = Py_CHARMASK(*u++);
Martin v. Löwis5b222132007-06-10 09:51:05 +0000463 /* Don't need to write trailing 0 because
464 that's already done by _PyUnicode_New */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000465 }
466
467 return (PyObject *)unicode;
468}
469
Walter Dörwaldd2034312007-05-18 16:29:38 +0000470PyObject *PyUnicode_FromString(const char *u)
471{
472 size_t size = strlen(u);
473 if (size > PY_SSIZE_T_MAX) {
474 PyErr_SetString(PyExc_OverflowError, "input too long");
475 return NULL;
476 }
477
478 return PyUnicode_FromStringAndSize(u, size);
479}
480
Guido van Rossumd57fd912000-03-10 22:53:23 +0000481#ifdef HAVE_WCHAR_H
482
483PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000484 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000485{
486 PyUnicodeObject *unicode;
487
488 if (w == NULL) {
489 PyErr_BadInternalCall();
490 return NULL;
491 }
492
493 unicode = _PyUnicode_New(size);
494 if (!unicode)
495 return NULL;
496
497 /* Copy the wchar_t data into the new object */
498#ifdef HAVE_USABLE_WCHAR_T
499 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000500#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000501 {
502 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000503 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000504 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000505 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 *u++ = *w++;
507 }
508#endif
509
510 return (PyObject *)unicode;
511}
512
Walter Dörwald346737f2007-05-31 10:44:43 +0000513static void
514makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
515{
516 *fmt++ = '%';
517 if (width) {
518 if (zeropad)
519 *fmt++ = '0';
520 fmt += sprintf(fmt, "%d", width);
521 }
522 if (precision)
523 fmt += sprintf(fmt, ".%d", precision);
524 if (longflag)
525 *fmt++ = 'l';
526 else if (size_tflag) {
527 char *f = PY_FORMAT_SIZE_T;
528 while (*f)
529 *fmt++ = *f++;
530 }
531 *fmt++ = c;
532 *fmt = '\0';
533}
534
/* Append the NUL-terminated char string `string` to the output, one
   character per element.  Relies on two variables of the calling
   function: `copy` (a const char * scratch pointer) and `s` (the
   output cursor, which is advanced).  Wrapped in do/while(0) so that
   the macro behaves like a single statement; invoke it with a
   trailing semicolon. */
#define appendstring(string) \
    do {for (copy = (string);*copy;) *s++ = *copy++;} while (0)
536
537PyObject *
538PyUnicode_FromFormatV(const char *format, va_list vargs)
539{
540 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000541 Py_ssize_t callcount = 0;
542 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000543 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000544 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000545 int width = 0;
546 int precision = 0;
547 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000548 const char* f;
549 Py_UNICODE *s;
550 PyObject *string;
551 /* used by sprintf */
552 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000553 /* use abuffer instead of buffer, if we need more space
554 * (which can happen if there's a format specifier with width). */
555 char *abuffer = NULL;
556 char *realbuffer;
557 Py_ssize_t abuffersize = 0;
558 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000559 const char *copy;
560
561#ifdef VA_LIST_IS_ARRAY
562 Py_MEMCPY(count, vargs, sizeof(va_list));
563#else
564#ifdef __va_copy
565 __va_copy(count, vargs);
566#else
567 count = vargs;
568#endif
569#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000570 /* step 1: count the number of %S/%R format specifications
571 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
572 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000573 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000574 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000575 ++callcount;
576 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000577 /* step 2: allocate memory for the results of
578 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000579 if (callcount) {
580 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
581 if (!callresults) {
582 PyErr_NoMemory();
583 return NULL;
584 }
585 callresult = callresults;
586 }
587 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000588 for (f = format; *f; f++) {
589 if (*f == '%') {
590 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000591 width = 0;
592 while (isdigit(Py_CHARMASK(*f)))
593 width = (width*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000594 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
595 ;
596
597 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
598 * they don't affect the amount of space we reserve.
599 */
600 if ((*f == 'l' || *f == 'z') &&
601 (f[1] == 'd' || f[1] == 'u'))
602 ++f;
603
604 switch (*f) {
605 case 'c':
606 (void)va_arg(count, int);
607 /* fall through... */
608 case '%':
609 n++;
610 break;
611 case 'd': case 'u': case 'i': case 'x':
612 (void) va_arg(count, int);
613 /* 20 bytes is enough to hold a 64-bit
614 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000615 This isn't enough for octal.
616 If a width is specified we need more
617 (which we allocate later). */
618 if (width < 20)
619 width = 20;
620 n += width;
621 if (abuffersize < width)
622 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000623 break;
624 case 's':
625 n += strlen(va_arg(count, char*));
626 break;
627 case 'U':
628 {
629 PyObject *obj = va_arg(count, PyObject *);
630 assert(obj && PyUnicode_Check(obj));
631 n += PyUnicode_GET_SIZE(obj);
632 break;
633 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000634 case 'V':
635 {
636 PyObject *obj = va_arg(count, PyObject *);
637 const char *str = va_arg(count, const char *);
638 assert(obj || str);
639 assert(!obj || PyUnicode_Check(obj));
640 if (obj)
641 n += PyUnicode_GET_SIZE(obj);
642 else
643 n += strlen(str);
644 break;
645 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000646 case 'S':
647 {
648 PyObject *obj = va_arg(count, PyObject *);
649 PyObject *str;
650 assert(obj);
651 str = PyObject_Unicode(obj);
652 if (!str)
653 goto fail;
654 n += PyUnicode_GET_SIZE(str);
655 /* Remember the str and switch to the next slot */
656 *callresult++ = str;
657 break;
658 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000659 case 'R':
660 {
661 PyObject *obj = va_arg(count, PyObject *);
662 PyObject *repr;
663 assert(obj);
664 repr = PyObject_Repr(obj);
665 if (!repr)
666 goto fail;
667 n += PyUnicode_GET_SIZE(repr);
668 /* Remember the repr and switch to the next slot */
669 *callresult++ = repr;
670 break;
671 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000672 case 'p':
673 (void) va_arg(count, int);
674 /* maximum 64-bit pointer representation:
675 * 0xffffffffffffffff
676 * so 19 characters is enough.
677 * XXX I count 18 -- what's the extra for?
678 */
679 n += 19;
680 break;
681 default:
682 /* if we stumble upon an unknown
683 formatting code, copy the rest of
684 the format string to the output
685 string. (we cannot just skip the
686 code, since there's no way to know
687 what's in the argument list) */
688 n += strlen(p);
689 goto expand;
690 }
691 } else
692 n++;
693 }
694 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000695 if (abuffersize > 20) {
696 abuffer = PyMem_Malloc(abuffersize);
697 if (!abuffer) {
698 PyErr_NoMemory();
699 goto fail;
700 }
701 realbuffer = abuffer;
702 }
703 else
704 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000705 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000706 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000707 we don't have to resize the string.
708 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000709 string = PyUnicode_FromUnicode(NULL, n);
710 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000711 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000712
713 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000714 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000715
716 for (f = format; *f; f++) {
717 if (*f == '%') {
718 const char* p = f++;
719 int longflag = 0;
720 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000721 zeropad = (*f == '0');
722 /* parse the width.precision part */
723 width = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000724 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000725 width = (width*10) + *f++ - '0';
726 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000727 if (*f == '.') {
728 f++;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000729 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000730 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000731 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000732 /* handle the long flag, but only for %ld and %lu.
733 others can be added when necessary. */
734 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
735 longflag = 1;
736 ++f;
737 }
738 /* handle the size_t flag. */
739 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
740 size_tflag = 1;
741 ++f;
742 }
743
744 switch (*f) {
745 case 'c':
746 *s++ = va_arg(vargs, int);
747 break;
748 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000749 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000750 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000751 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000752 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000753 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000754 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000755 sprintf(realbuffer, fmt, va_arg(vargs, int));
756 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000757 break;
758 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000759 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000760 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000761 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000762 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000763 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000764 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000765 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
766 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000767 break;
768 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000769 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
770 sprintf(realbuffer, fmt, va_arg(vargs, int));
771 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000772 break;
773 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000774 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
775 sprintf(realbuffer, fmt, va_arg(vargs, int));
776 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000777 break;
778 case 's':
779 p = va_arg(vargs, char*);
780 appendstring(p);
781 break;
782 case 'U':
783 {
784 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000785 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
786 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
787 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000788 break;
789 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000790 case 'V':
791 {
792 PyObject *obj = va_arg(vargs, PyObject *);
793 const char *str = va_arg(vargs, const char *);
794 if (obj) {
795 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
796 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
797 s += size;
798 } else {
799 appendstring(str);
800 }
801 break;
802 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000803 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000804 case 'R':
805 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000806 Py_UNICODE *ucopy;
807 Py_ssize_t usize;
808 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000809 /* unused, since we already have the result */
810 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000811 ucopy = PyUnicode_AS_UNICODE(*callresult);
812 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000813 for (upos = 0; upos<usize;)
814 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000815 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000816 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000817 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000818 ++callresult;
819 break;
820 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000821 case 'p':
822 sprintf(buffer, "%p", va_arg(vargs, void*));
823 /* %p is ill-defined: ensure leading 0x. */
824 if (buffer[1] == 'X')
825 buffer[1] = 'x';
826 else if (buffer[1] != 'x') {
827 memmove(buffer+2, buffer, strlen(buffer)+1);
828 buffer[0] = '0';
829 buffer[1] = 'x';
830 }
831 appendstring(buffer);
832 break;
833 case '%':
834 *s++ = '%';
835 break;
836 default:
837 appendstring(p);
838 goto end;
839 }
840 } else
841 *s++ = *f;
842 }
843
844 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000845 if (callresults)
846 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000847 if (abuffer)
848 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000849 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
850 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000851 fail:
852 if (callresults) {
853 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000854 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000855 Py_DECREF(*callresult2);
856 ++callresult2;
857 }
858 PyMem_Free(callresults);
859 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000860 if (abuffer)
861 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000862 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000863}
864
865#undef appendstring
866
867PyObject *
868PyUnicode_FromFormat(const char *format, ...)
869{
870 PyObject* ret;
871 va_list vargs;
872
873#ifdef HAVE_STDARG_PROTOTYPES
874 va_start(vargs, format);
875#else
876 va_start(vargs);
877#endif
878 ret = PyUnicode_FromFormatV(format, vargs);
879 va_end(vargs);
880 return ret;
881}
882
Martin v. Löwis18e16552006-02-15 17:27:45 +0000883Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
884 wchar_t *w,
885 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000886{
887 if (unicode == NULL) {
888 PyErr_BadInternalCall();
889 return -1;
890 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000891
892 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000893 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000894 size = PyUnicode_GET_SIZE(unicode) + 1;
895
Guido van Rossumd57fd912000-03-10 22:53:23 +0000896#ifdef HAVE_USABLE_WCHAR_T
897 memcpy(w, unicode->str, size * sizeof(wchar_t));
898#else
899 {
900 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000901 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000902 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000903 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000904 *w++ = *u++;
905 }
906#endif
907
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000908 if (size > PyUnicode_GET_SIZE(unicode))
909 return PyUnicode_GET_SIZE(unicode);
910 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000911 return size;
912}
913
914#endif
915
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000916PyObject *PyUnicode_FromOrdinal(int ordinal)
917{
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000918 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000919
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000920 if (ordinal < 0 || ordinal > 0x10ffff) {
921 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000922 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000923 return NULL;
924 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000925
926#ifndef Py_UNICODE_WIDE
927 if (ordinal > 0xffff) {
928 ordinal -= 0x10000;
929 s[0] = 0xD800 | (ordinal >> 10);
930 s[1] = 0xDC00 | (ordinal & 0x3FF);
931 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000932 }
933#endif
934
Hye-Shik Chang40574832004-04-06 07:24:51 +0000935 s[0] = (Py_UNICODE)ordinal;
936 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000937}
938
Guido van Rossumd57fd912000-03-10 22:53:23 +0000939PyObject *PyUnicode_FromObject(register PyObject *obj)
940{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000941 /* XXX Perhaps we should make this API an alias of
942 PyObject_Unicode() instead ?! */
943 if (PyUnicode_CheckExact(obj)) {
944 Py_INCREF(obj);
945 return obj;
946 }
947 if (PyUnicode_Check(obj)) {
948 /* For a Unicode subtype that's not a Unicode object,
949 return a true Unicode object with the same data. */
950 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
951 PyUnicode_GET_SIZE(obj));
952 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000953 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
954}
955
956PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
957 const char *encoding,
958 const char *errors)
959{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000960 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000961 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000962 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000963
Guido van Rossumd57fd912000-03-10 22:53:23 +0000964 if (obj == NULL) {
965 PyErr_BadInternalCall();
966 return NULL;
967 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000968
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000969#if 0
970 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000971 that no encodings is given and then redirect to
972 PyObject_Unicode() which then applies the additional logic for
973 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000974
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000975 NOTE: This API should really only be used for object which
976 represent *encoded* Unicode !
977
978 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000979 if (PyUnicode_Check(obj)) {
980 if (encoding) {
981 PyErr_SetString(PyExc_TypeError,
982 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000983 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000984 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000985 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000986 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000987#else
988 if (PyUnicode_Check(obj)) {
989 PyErr_SetString(PyExc_TypeError,
990 "decoding Unicode is not supported");
991 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000992 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000993#endif
994
995 /* Coerce object */
996 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000997 s = PyString_AS_STRING(obj);
998 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000999 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001000 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1001 /* Overwrite the error message with something more useful in
1002 case of a TypeError. */
1003 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001004 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001005 "coercing to Unicode: need string or buffer, "
1006 "%.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001007 Py_Type(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001008 goto onError;
1009 }
Tim Petersced69f82003-09-16 20:30:58 +00001010
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001011 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001012 if (len == 0) {
1013 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001014 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001015 }
Tim Petersced69f82003-09-16 20:30:58 +00001016 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001017 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001018
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001019 return v;
1020
1021 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001022 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001023}
1024
1025PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001026 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001027 const char *encoding,
1028 const char *errors)
1029{
1030 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001031
1032 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001033 encoding = PyUnicode_GetDefaultEncoding();
1034
1035 /* Shortcuts for common default encodings */
1036 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001037 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001038 else if (strcmp(encoding, "latin-1") == 0)
1039 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001040#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1041 else if (strcmp(encoding, "mbcs") == 0)
1042 return PyUnicode_DecodeMBCS(s, size, errors);
1043#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001044 else if (strcmp(encoding, "ascii") == 0)
1045 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001046
1047 /* Decode via the codec registry */
1048 buffer = PyBuffer_FromMemory((void *)s, size);
1049 if (buffer == NULL)
1050 goto onError;
1051 unicode = PyCodec_Decode(buffer, encoding, errors);
1052 if (unicode == NULL)
1053 goto onError;
1054 if (!PyUnicode_Check(unicode)) {
1055 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001056 "decoder did not return an unicode object (type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001057 Py_Type(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001058 Py_DECREF(unicode);
1059 goto onError;
1060 }
1061 Py_DECREF(buffer);
1062 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001063
Guido van Rossumd57fd912000-03-10 22:53:23 +00001064 onError:
1065 Py_XDECREF(buffer);
1066 return NULL;
1067}
1068
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001069PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1070 const char *encoding,
1071 const char *errors)
1072{
1073 PyObject *v;
1074
1075 if (!PyUnicode_Check(unicode)) {
1076 PyErr_BadArgument();
1077 goto onError;
1078 }
1079
1080 if (encoding == NULL)
1081 encoding = PyUnicode_GetDefaultEncoding();
1082
1083 /* Decode via the codec registry */
1084 v = PyCodec_Decode(unicode, encoding, errors);
1085 if (v == NULL)
1086 goto onError;
1087 return v;
1088
1089 onError:
1090 return NULL;
1091}
1092
Guido van Rossumd57fd912000-03-10 22:53:23 +00001093PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001094 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001095 const char *encoding,
1096 const char *errors)
1097{
1098 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001099
Guido van Rossumd57fd912000-03-10 22:53:23 +00001100 unicode = PyUnicode_FromUnicode(s, size);
1101 if (unicode == NULL)
1102 return NULL;
1103 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1104 Py_DECREF(unicode);
1105 return v;
1106}
1107
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001108PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1109 const char *encoding,
1110 const char *errors)
1111{
1112 PyObject *v;
1113
1114 if (!PyUnicode_Check(unicode)) {
1115 PyErr_BadArgument();
1116 goto onError;
1117 }
1118
1119 if (encoding == NULL)
1120 encoding = PyUnicode_GetDefaultEncoding();
1121
1122 /* Encode via the codec registry */
1123 v = PyCodec_Encode(unicode, encoding, errors);
1124 if (v == NULL)
1125 goto onError;
1126 return v;
1127
1128 onError:
1129 return NULL;
1130}
1131
Guido van Rossumd57fd912000-03-10 22:53:23 +00001132PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1133 const char *encoding,
1134 const char *errors)
1135{
1136 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001137
Guido van Rossumd57fd912000-03-10 22:53:23 +00001138 if (!PyUnicode_Check(unicode)) {
1139 PyErr_BadArgument();
1140 goto onError;
1141 }
Fred Drakee4315f52000-05-09 19:53:39 +00001142
Tim Petersced69f82003-09-16 20:30:58 +00001143 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001144 encoding = PyUnicode_GetDefaultEncoding();
1145
1146 /* Shortcuts for common default encodings */
1147 if (errors == NULL) {
1148 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001149 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001150 else if (strcmp(encoding, "latin-1") == 0)
1151 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001152#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1153 else if (strcmp(encoding, "mbcs") == 0)
1154 return PyUnicode_AsMBCSString(unicode);
1155#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001156 else if (strcmp(encoding, "ascii") == 0)
1157 return PyUnicode_AsASCIIString(unicode);
1158 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001159
1160 /* Encode via the codec registry */
1161 v = PyCodec_Encode(unicode, encoding, errors);
1162 if (v == NULL)
1163 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001164 if (!PyBytes_Check(v)) {
1165 if (PyString_Check(v)) {
1166 /* Old codec, turn it into bytes */
1167 PyObject *b = PyBytes_FromObject(v);
1168 Py_DECREF(v);
1169 return b;
1170 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001171 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001172 "encoder did not return a bytes object "
1173 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1174 v->ob_type->tp_name,
1175 encoding ? encoding : "NULL",
1176 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 Py_DECREF(v);
1178 goto onError;
1179 }
1180 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001181
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182 onError:
1183 return NULL;
1184}
1185
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001186PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1187 const char *errors)
1188{
1189 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001190 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001191 if (v)
1192 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001193 if (errors != NULL)
1194 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1195 if (errors == NULL) {
1196 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1197 PyUnicode_GET_SIZE(unicode),
1198 NULL);
1199 }
1200 else {
1201 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
1202 }
1203 if (!b)
1204 return NULL;
1205 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1206 PyBytes_Size(b));
1207 Py_DECREF(b);
Guido van Rossume7a0d392007-07-12 07:53:00 +00001208 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001209 return v;
1210}
1211
Martin v. Löwis5b222132007-06-10 09:51:05 +00001212char*
1213PyUnicode_AsString(PyObject *unicode)
1214{
1215 assert(PyUnicode_Check(unicode));
1216 unicode = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1217 if (!unicode)
1218 return NULL;
1219 return PyString_AsString(unicode);
1220}
1221
Guido van Rossumd57fd912000-03-10 22:53:23 +00001222Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1223{
1224 if (!PyUnicode_Check(unicode)) {
1225 PyErr_BadArgument();
1226 goto onError;
1227 }
1228 return PyUnicode_AS_UNICODE(unicode);
1229
1230 onError:
1231 return NULL;
1232}
1233
Martin v. Löwis18e16552006-02-15 17:27:45 +00001234Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001235{
1236 if (!PyUnicode_Check(unicode)) {
1237 PyErr_BadArgument();
1238 goto onError;
1239 }
1240 return PyUnicode_GET_SIZE(unicode);
1241
1242 onError:
1243 return -1;
1244}
1245
Thomas Wouters78890102000-07-22 19:25:51 +00001246const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001247{
1248 return unicode_default_encoding;
1249}
1250
1251int PyUnicode_SetDefaultEncoding(const char *encoding)
1252{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001253 if (strcmp(encoding, unicode_default_encoding) != 0) {
1254 PyErr_Format(PyExc_ValueError,
1255 "Can only set default encoding to %s",
1256 unicode_default_encoding);
1257 return -1;
1258 }
Fred Drakee4315f52000-05-09 19:53:39 +00001259 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001260}
1261
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001262/* error handling callback helper:
1263 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001264 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001265 and adjust various state variables.
1266 return 0 on success, -1 on error
1267*/
1268
1269static
1270int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1271 const char *encoding, const char *reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001272 const char **input, const char **inend, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001273 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001274{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001275 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001276
1277 PyObject *restuple = NULL;
1278 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001279 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001280 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001281 Py_ssize_t requiredsize;
1282 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001283 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001284 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001285 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001286 int res = -1;
1287
1288 if (*errorHandler == NULL) {
1289 *errorHandler = PyCodec_LookupError(errors);
1290 if (*errorHandler == NULL)
1291 goto onError;
1292 }
1293
1294 if (*exceptionObject == NULL) {
1295 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001296 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001297 if (*exceptionObject == NULL)
1298 goto onError;
1299 }
1300 else {
1301 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1302 goto onError;
1303 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1304 goto onError;
1305 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1306 goto onError;
1307 }
1308
1309 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1310 if (restuple == NULL)
1311 goto onError;
1312 if (!PyTuple_Check(restuple)) {
1313 PyErr_Format(PyExc_TypeError, &argparse[4]);
1314 goto onError;
1315 }
1316 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1317 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001318
1319 /* Copy back the bytes variables, which might have been modified by the
1320 callback */
1321 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1322 if (!inputobj)
1323 goto onError;
1324 if (!PyBytes_Check(inputobj)) {
1325 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1326 }
1327 *input = PyBytes_AS_STRING(inputobj);
1328 insize = PyBytes_GET_SIZE(inputobj);
1329 *inend = *input + insize;
1330
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001331 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001332 newpos = insize+newpos;
1333 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001334 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001335 goto onError;
1336 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001337
1338 /* need more space? (at least enough for what we
1339 have+the replacement+the rest of the string (starting
1340 at the new input position), so we won't have to check space
1341 when there are no errors in the rest of the string) */
1342 repptr = PyUnicode_AS_UNICODE(repunicode);
1343 repsize = PyUnicode_GET_SIZE(repunicode);
1344 requiredsize = *outpos + repsize + insize-newpos;
1345 if (requiredsize > outsize) {
1346 if (requiredsize<2*outsize)
1347 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001348 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001349 goto onError;
1350 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1351 }
1352 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001353 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001354 Py_UNICODE_COPY(*outptr, repptr, repsize);
1355 *outptr += repsize;
1356 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001357
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001358 /* we made it! */
1359 res = 0;
1360
1361 onError:
1362 Py_XDECREF(restuple);
1363 return res;
1364}
1365
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001366/* --- UTF-7 Codec -------------------------------------------------------- */
1367
1368/* see RFC2152 for details */
1369
/* Per-character classification of the 128 ASCII codes for UTF-7, i.e.
   whether a character is special and cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)
   One row per 16 codes; the row comment gives the first code of the row. */
static char utf7_special[128] = {
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1388
/* SPECIAL(c, encodeO, encodeWS): true if character c cannot be written
   directly in UTF-7.  Non-ASCII characters and class-1 entries of
   utf7_special are always special; whitespace (class 2) and RFC2152 Set O
   (class 3) are special only when the corresponding flag is true.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low 6 bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true if c is a base64 digit.  c is range-checked before it
   reaches isalnum(): callers pass Py_UNICODE values, and isalnum() is
   undefined for arguments that are not representable as unsigned char
   (and locale-dependent for 128..255, which are never base64 digits). */
#define B64CHAR(c) \
    (((c) > 0 && (c) < 128 && isalnum(c)) || (c) == '+' || (c) == '/')

/* UB64(c): 6-bit value of base64 digit c; only meaningful when
   B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least 6 bits are pending in ch, emit the
   top 6 of them through out as a base64 digit and decrement bits.
   Expands to a bare while statement and modifies out and bits. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending in
   ch, extract the top 16 as one Py_UNICODE and store it through out.
   Expands to a bare while statement; the expansion site must provide a
   variable `errmsg` and a label `utf7Error`, which are used when a value in
   0xDC00..0xDFFF is extracted. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001431
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001432PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001433 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001434 const char *errors)
1435{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001436 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001437 Py_ssize_t startinpos;
1438 Py_ssize_t endinpos;
1439 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001440 const char *e;
1441 PyUnicodeObject *unicode;
1442 Py_UNICODE *p;
1443 const char *errmsg = "";
1444 int inShift = 0;
1445 unsigned int bitsleft = 0;
1446 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001447 int surrogate = 0;
1448 PyObject *errorHandler = NULL;
1449 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001450
1451 unicode = _PyUnicode_New(size);
1452 if (!unicode)
1453 return NULL;
1454 if (size == 0)
1455 return (PyObject *)unicode;
1456
1457 p = unicode->str;
1458 e = s + size;
1459
1460 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001461 Py_UNICODE ch;
1462 restart:
1463 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001464
1465 if (inShift) {
1466 if ((ch == '-') || !B64CHAR(ch)) {
1467 inShift = 0;
1468 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001469
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001470 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1471 if (bitsleft >= 6) {
1472 /* The shift sequence has a partial character in it. If
1473 bitsleft < 6 then we could just classify it as padding
1474 but that is not the case here */
1475
1476 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001477 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001478 }
1479 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001480 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001481 here so indicate the potential of a misencoded character. */
1482
1483 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1484 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1485 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001486 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001487 }
1488
1489 if (ch == '-') {
1490 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001491 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001492 inShift = 1;
1493 }
1494 } else if (SPECIAL(ch,0,0)) {
1495 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001496 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001497 } else {
1498 *p++ = ch;
1499 }
1500 } else {
1501 charsleft = (charsleft << 6) | UB64(ch);
1502 bitsleft += 6;
1503 s++;
1504 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1505 }
1506 }
1507 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001508 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001509 s++;
1510 if (s < e && *s == '-') {
1511 s++;
1512 *p++ = '+';
1513 } else
1514 {
1515 inShift = 1;
1516 bitsleft = 0;
1517 }
1518 }
1519 else if (SPECIAL(ch,0,0)) {
1520 errmsg = "unexpected special character";
1521 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001522 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001523 }
1524 else {
1525 *p++ = ch;
1526 s++;
1527 }
1528 continue;
1529 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001530 outpos = p-PyUnicode_AS_UNICODE(unicode);
1531 endinpos = s-starts;
1532 if (unicode_decode_call_errorhandler(
1533 errors, &errorHandler,
1534 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001535 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001536 (PyObject **)&unicode, &outpos, &p))
1537 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001538 }
1539
1540 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001541 outpos = p-PyUnicode_AS_UNICODE(unicode);
1542 endinpos = size;
1543 if (unicode_decode_call_errorhandler(
1544 errors, &errorHandler,
1545 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001546 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001547 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001548 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001549 if (s < e)
1550 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001551 }
1552
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001553 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001554 goto onError;
1555
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001556 Py_XDECREF(errorHandler);
1557 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001558 return (PyObject *)unicode;
1559
1560onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001561 Py_XDECREF(errorHandler);
1562 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001563 Py_DECREF(unicode);
1564 return NULL;
1565}
1566
1567
1568PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001569 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001570 int encodeSetO,
1571 int encodeWhiteSpace,
1572 const char *errors)
1573{
1574 PyObject *v;
1575 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001576 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001577 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001578 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001579 unsigned int bitsleft = 0;
1580 unsigned long charsleft = 0;
1581 char * out;
1582 char * start;
1583
1584 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001585 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001586
Walter Dörwald51ab4142007-05-05 14:43:36 +00001587 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001588 if (v == NULL)
1589 return NULL;
1590
Walter Dörwald51ab4142007-05-05 14:43:36 +00001591 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001592 for (;i < size; ++i) {
1593 Py_UNICODE ch = s[i];
1594
1595 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001596 if (ch == '+') {
1597 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001598 *out++ = '-';
1599 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1600 charsleft = ch;
1601 bitsleft = 16;
1602 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001603 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001604 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001605 } else {
1606 *out++ = (char) ch;
1607 }
1608 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001609 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1610 *out++ = B64(charsleft << (6-bitsleft));
1611 charsleft = 0;
1612 bitsleft = 0;
1613 /* Characters not in the BASE64 set implicitly unshift the sequence
1614 so no '-' is required, except if the character is itself a '-' */
1615 if (B64CHAR(ch) || ch == '-') {
1616 *out++ = '-';
1617 }
1618 inShift = 0;
1619 *out++ = (char) ch;
1620 } else {
1621 bitsleft += 16;
1622 charsleft = (charsleft << 16) | ch;
1623 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1624
1625 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001626 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001627 or '-' then the shift sequence will be terminated implicitly and we
1628 don't have to insert a '-'. */
1629
1630 if (bitsleft == 0) {
1631 if (i + 1 < size) {
1632 Py_UNICODE ch2 = s[i+1];
1633
1634 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001635
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001636 } else if (B64CHAR(ch2) || ch2 == '-') {
1637 *out++ = '-';
1638 inShift = 0;
1639 } else {
1640 inShift = 0;
1641 }
1642
1643 }
1644 else {
1645 *out++ = '-';
1646 inShift = 0;
1647 }
1648 }
Tim Petersced69f82003-09-16 20:30:58 +00001649 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001650 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001651 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001652 if (bitsleft) {
1653 *out++= B64(charsleft << (6-bitsleft) );
1654 *out++ = '-';
1655 }
1656
Walter Dörwald51ab4142007-05-05 14:43:36 +00001657 if (PyBytes_Resize(v, out - start)) {
1658 Py_DECREF(v);
1659 return NULL;
1660 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001661 return v;
1662}
1663
1664#undef SPECIAL
1665#undef B64
1666#undef B64CHAR
1667#undef UB64
1668#undef ENCODE
1669#undef DECODE
1670
Guido van Rossumd57fd912000-03-10 22:53:23 +00001671/* --- UTF-8 Codec -------------------------------------------------------- */
1672
/* Lookup table indexed by the first byte of a UTF-8 sequence; the entry is
   the total length of the sequence that byte introduces.  An entry of zero
   marks a byte that cannot start a sequence.  See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 0x00 */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10 */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 0x20 */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 0x30 */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40 */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 0x50 */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60 */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 0x70 */
    /* 0x80 - 0xBF: not a valid first byte */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,   /* 0x80 */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,   /* 0x90 */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,   /* 0xA0 */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,   /* 0xB0 */
    /* 0xC0 - 0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,   /* 0xC0 */
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,   /* 0xD0 */
    /* 0xE0 - 0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,   /* 0xE0 */
    /* 0xF0 - 0xFD: four, five and six bytes; 0xFE and 0xFF are illegal */
    4, 4, 4, 4, 4, 4, 4, 4,  5, 5, 5, 5, 6, 6, 0, 0    /* 0xF0 */
};
1694
Guido van Rossumd57fd912000-03-10 22:53:23 +00001695PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001696 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001697 const char *errors)
1698{
Walter Dörwald69652032004-09-07 20:24:22 +00001699 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1700}
1701
1702PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001703 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001704 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001705 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001706{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001707 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001708 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001709 Py_ssize_t startinpos;
1710 Py_ssize_t endinpos;
1711 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001712 const char *e;
1713 PyUnicodeObject *unicode;
1714 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001715 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001716 PyObject *errorHandler = NULL;
1717 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718
1719 /* Note: size will always be longer than the resulting Unicode
1720 character count */
1721 unicode = _PyUnicode_New(size);
1722 if (!unicode)
1723 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001724 if (size == 0) {
1725 if (consumed)
1726 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001727 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001728 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001729
1730 /* Unpack UTF-8 encoded data */
1731 p = unicode->str;
1732 e = s + size;
1733
1734 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001735 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001736
1737 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001738 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739 s++;
1740 continue;
1741 }
1742
1743 n = utf8_code_length[ch];
1744
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001745 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001746 if (consumed)
1747 break;
1748 else {
1749 errmsg = "unexpected end of data";
1750 startinpos = s-starts;
1751 endinpos = size;
1752 goto utf8Error;
1753 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001754 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001755
1756 switch (n) {
1757
1758 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001759 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001760 startinpos = s-starts;
1761 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001762 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001763
1764 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001765 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001766 startinpos = s-starts;
1767 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001768 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001769
1770 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001771 if ((s[1] & 0xc0) != 0x80) {
1772 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001773 startinpos = s-starts;
1774 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001775 goto utf8Error;
1776 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001777 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001778 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001779 startinpos = s-starts;
1780 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001781 errmsg = "illegal encoding";
1782 goto utf8Error;
1783 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001785 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 break;
1787
1788 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001789 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001790 (s[2] & 0xc0) != 0x80) {
1791 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001792 startinpos = s-starts;
1793 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001794 goto utf8Error;
1795 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001796 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001797 if (ch < 0x0800) {
1798 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001799 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001800
1801 XXX For wide builds (UCS-4) we should probably try
1802 to recombine the surrogates into a single code
1803 unit.
1804 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001805 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001806 startinpos = s-starts;
1807 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001808 goto utf8Error;
1809 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001811 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001812 break;
1813
1814 case 4:
1815 if ((s[1] & 0xc0) != 0x80 ||
1816 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001817 (s[3] & 0xc0) != 0x80) {
1818 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001819 startinpos = s-starts;
1820 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001821 goto utf8Error;
1822 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001823 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1824 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1825 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001826 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001827 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001828 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001829 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001830 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001831 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001832 startinpos = s-starts;
1833 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001834 goto utf8Error;
1835 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001836#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001837 *p++ = (Py_UNICODE)ch;
1838#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001839 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001840
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001841 /* translate from 10000..10FFFF to 0..FFFF */
1842 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001843
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001844 /* high surrogate = top 10 bits added to D800 */
1845 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001846
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001847 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001848 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001849#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001850 break;
1851
1852 default:
1853 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001854 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001855 startinpos = s-starts;
1856 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001857 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001858 }
1859 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001860 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001861
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001862 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001863 outpos = p-PyUnicode_AS_UNICODE(unicode);
1864 if (unicode_decode_call_errorhandler(
1865 errors, &errorHandler,
1866 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001867 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001868 (PyObject **)&unicode, &outpos, &p))
1869 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001870 }
Walter Dörwald69652032004-09-07 20:24:22 +00001871 if (consumed)
1872 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001873
1874 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001875 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001876 goto onError;
1877
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001878 Py_XDECREF(errorHandler);
1879 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001880 return (PyObject *)unicode;
1881
1882onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001883 Py_XDECREF(errorHandler);
1884 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001885 Py_DECREF(unicode);
1886 return NULL;
1887}
1888
Tim Peters602f7402002-04-27 18:03:26 +00001889/* Allocation strategy: if the string is short, convert into a stack buffer
1890 and allocate exactly as much space needed at the end. Else allocate the
1891 maximum possible needed (4 result bytes per Unicode character), and return
1892 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001893*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001894PyObject *
1895PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001896 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001897 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001898{
Tim Peters602f7402002-04-27 18:03:26 +00001899#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001900
Martin v. Löwis18e16552006-02-15 17:27:45 +00001901 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001902 PyObject *v; /* result string object */
1903 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001904 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001905 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001906 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001907
Tim Peters602f7402002-04-27 18:03:26 +00001908 assert(s != NULL);
1909 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001910
Tim Peters602f7402002-04-27 18:03:26 +00001911 if (size <= MAX_SHORT_UNICHARS) {
1912 /* Write into the stack buffer; nallocated can't overflow.
1913 * At the end, we'll allocate exactly as much heap space as it
1914 * turns out we need.
1915 */
1916 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1917 v = NULL; /* will allocate after we're done */
1918 p = stackbuf;
1919 }
1920 else {
1921 /* Overallocate on the heap, and give the excess back at the end. */
1922 nallocated = size * 4;
1923 if (nallocated / 4 != size) /* overflow! */
1924 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001925 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001926 if (v == NULL)
1927 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001928 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001929 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001930
Tim Peters602f7402002-04-27 18:03:26 +00001931 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001932 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001933
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001934 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001935 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001936 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001937
Guido van Rossumd57fd912000-03-10 22:53:23 +00001938 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001939 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001940 *p++ = (char)(0xc0 | (ch >> 6));
1941 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001942 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001943 else {
Tim Peters602f7402002-04-27 18:03:26 +00001944 /* Encode UCS2 Unicode ordinals */
1945 if (ch < 0x10000) {
1946 /* Special case: check for high surrogate */
1947 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1948 Py_UCS4 ch2 = s[i];
1949 /* Check for low surrogate and combine the two to
1950 form a UCS4 value */
1951 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001952 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001953 i++;
1954 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001955 }
Tim Peters602f7402002-04-27 18:03:26 +00001956 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001957 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001958 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001959 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1960 *p++ = (char)(0x80 | (ch & 0x3f));
1961 continue;
1962 }
1963encodeUCS4:
1964 /* Encode UCS4 Unicode ordinals */
1965 *p++ = (char)(0xf0 | (ch >> 18));
1966 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1967 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1968 *p++ = (char)(0x80 | (ch & 0x3f));
1969 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001970 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001971
Tim Peters602f7402002-04-27 18:03:26 +00001972 if (v == NULL) {
1973 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001974 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001975 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001976 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001977 }
1978 else {
1979 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001980 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001981 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001982 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001983 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001985
Tim Peters602f7402002-04-27 18:03:26 +00001986#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987}
1988
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1990{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001991 if (!PyUnicode_Check(unicode)) {
1992 PyErr_BadArgument();
1993 return NULL;
1994 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001995 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1996 PyUnicode_GET_SIZE(unicode),
1997 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001998}
1999
2000/* --- UTF-16 Codec ------------------------------------------------------- */
2001
Tim Peters772747b2001-08-09 22:21:55 +00002002PyObject *
2003PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002004 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002005 const char *errors,
2006 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002007{
Walter Dörwald69652032004-09-07 20:24:22 +00002008 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2009}
2010
2011PyObject *
2012PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002013 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002014 const char *errors,
2015 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002016 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002017{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002018 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002019 Py_ssize_t startinpos;
2020 Py_ssize_t endinpos;
2021 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002022 PyUnicodeObject *unicode;
2023 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002024 const unsigned char *q, *e;
2025 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002026 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002027 /* Offsets from q for retrieving byte pairs in the right order. */
2028#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2029 int ihi = 1, ilo = 0;
2030#else
2031 int ihi = 0, ilo = 1;
2032#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002033 PyObject *errorHandler = NULL;
2034 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002035
2036 /* Note: size will always be longer than the resulting Unicode
2037 character count */
2038 unicode = _PyUnicode_New(size);
2039 if (!unicode)
2040 return NULL;
2041 if (size == 0)
2042 return (PyObject *)unicode;
2043
2044 /* Unpack UTF-16 encoded data */
2045 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002046 q = (unsigned char *)s;
2047 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048
2049 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002050 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002052 /* Check for BOM marks (U+FEFF) in the input and adjust current
2053 byte order setting accordingly. In native mode, the leading BOM
2054 mark is skipped, in all other modes, it is copied to the output
2055 stream as-is (giving a ZWNBSP character). */
2056 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002057 if (size >= 2) {
2058 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002059#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002060 if (bom == 0xFEFF) {
2061 q += 2;
2062 bo = -1;
2063 }
2064 else if (bom == 0xFFFE) {
2065 q += 2;
2066 bo = 1;
2067 }
Tim Petersced69f82003-09-16 20:30:58 +00002068#else
Walter Dörwald69652032004-09-07 20:24:22 +00002069 if (bom == 0xFEFF) {
2070 q += 2;
2071 bo = 1;
2072 }
2073 else if (bom == 0xFFFE) {
2074 q += 2;
2075 bo = -1;
2076 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002077#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002078 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002079 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002080
Tim Peters772747b2001-08-09 22:21:55 +00002081 if (bo == -1) {
2082 /* force LE */
2083 ihi = 1;
2084 ilo = 0;
2085 }
2086 else if (bo == 1) {
2087 /* force BE */
2088 ihi = 0;
2089 ilo = 1;
2090 }
2091
2092 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002093 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002094 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002095 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002096 if (consumed)
2097 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002098 errmsg = "truncated data";
2099 startinpos = ((const char *)q)-starts;
2100 endinpos = ((const char *)e)-starts;
2101 goto utf16Error;
2102 /* The remaining input chars are ignored if the callback
2103 chooses to skip the input */
2104 }
2105 ch = (q[ihi] << 8) | q[ilo];
2106
Tim Peters772747b2001-08-09 22:21:55 +00002107 q += 2;
2108
Guido van Rossumd57fd912000-03-10 22:53:23 +00002109 if (ch < 0xD800 || ch > 0xDFFF) {
2110 *p++ = ch;
2111 continue;
2112 }
2113
2114 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002115 if (q >= e) {
2116 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002117 startinpos = (((const char *)q)-2)-starts;
2118 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002119 goto utf16Error;
2120 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002121 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002122 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2123 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002124 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002125#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002126 *p++ = ch;
2127 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002128#else
2129 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002130#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002131 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002132 }
2133 else {
2134 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002135 startinpos = (((const char *)q)-4)-starts;
2136 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002137 goto utf16Error;
2138 }
2139
Guido van Rossumd57fd912000-03-10 22:53:23 +00002140 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002141 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002142 startinpos = (((const char *)q)-2)-starts;
2143 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002144 /* Fall through to report the error */
2145
2146 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002147 outpos = p-PyUnicode_AS_UNICODE(unicode);
2148 if (unicode_decode_call_errorhandler(
2149 errors, &errorHandler,
2150 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002151 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002152 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002153 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002154 }
2155
2156 if (byteorder)
2157 *byteorder = bo;
2158
Walter Dörwald69652032004-09-07 20:24:22 +00002159 if (consumed)
2160 *consumed = (const char *)q-starts;
2161
Guido van Rossumd57fd912000-03-10 22:53:23 +00002162 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002163 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002164 goto onError;
2165
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002166 Py_XDECREF(errorHandler);
2167 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002168 return (PyObject *)unicode;
2169
2170onError:
2171 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002172 Py_XDECREF(errorHandler);
2173 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174 return NULL;
2175}
2176
Tim Peters772747b2001-08-09 22:21:55 +00002177PyObject *
2178PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002179 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002180 const char *errors,
2181 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182{
2183 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002184 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002185#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002186 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002187#else
2188 const int pairs = 0;
2189#endif
Tim Peters772747b2001-08-09 22:21:55 +00002190 /* Offsets from p for storing byte pairs in the right order. */
2191#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2192 int ihi = 1, ilo = 0;
2193#else
2194 int ihi = 0, ilo = 1;
2195#endif
2196
2197#define STORECHAR(CH) \
2198 do { \
2199 p[ihi] = ((CH) >> 8) & 0xff; \
2200 p[ilo] = (CH) & 0xff; \
2201 p += 2; \
2202 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002203
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002204#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002205 for (i = pairs = 0; i < size; i++)
2206 if (s[i] >= 0x10000)
2207 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002208#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002209 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002210 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002211 if (v == NULL)
2212 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002213
Walter Dörwald3cc34522007-05-04 10:48:27 +00002214 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002215 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002216 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002217 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002218 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002219
2220 if (byteorder == -1) {
2221 /* force LE */
2222 ihi = 1;
2223 ilo = 0;
2224 }
2225 else if (byteorder == 1) {
2226 /* force BE */
2227 ihi = 0;
2228 ilo = 1;
2229 }
2230
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002231 while (size-- > 0) {
2232 Py_UNICODE ch = *s++;
2233 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002234#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002235 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002236 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2237 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002238 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002239#endif
Tim Peters772747b2001-08-09 22:21:55 +00002240 STORECHAR(ch);
2241 if (ch2)
2242 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002243 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002244 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002245#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002246}
2247
2248PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2249{
2250 if (!PyUnicode_Check(unicode)) {
2251 PyErr_BadArgument();
2252 return NULL;
2253 }
2254 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2255 PyUnicode_GET_SIZE(unicode),
2256 NULL,
2257 0);
2258}
2259
2260/* --- Unicode Escape Codec ----------------------------------------------- */
2261
Fredrik Lundh06d12682001-01-24 07:59:11 +00002262static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002263
Guido van Rossumd57fd912000-03-10 22:53:23 +00002264PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002265 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002266 const char *errors)
2267{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002268 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002269 Py_ssize_t startinpos;
2270 Py_ssize_t endinpos;
2271 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002272 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002273 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002274 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002275 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002276 char* message;
2277 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002278 PyObject *errorHandler = NULL;
2279 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002280
Guido van Rossumd57fd912000-03-10 22:53:23 +00002281 /* Escaped strings will always be longer than the resulting
2282 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002283 length after conversion to the true value.
2284 (but if the error callback returns a long replacement string
2285 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002286 v = _PyUnicode_New(size);
2287 if (v == NULL)
2288 goto onError;
2289 if (size == 0)
2290 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002291
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002292 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002293 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002294
Guido van Rossumd57fd912000-03-10 22:53:23 +00002295 while (s < end) {
2296 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002297 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002298 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299
2300 /* Non-escape characters are interpreted as Unicode ordinals */
2301 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002302 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002303 continue;
2304 }
2305
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002306 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002307 /* \ - Escapes */
2308 s++;
2309 switch (*s++) {
2310
2311 /* \x escapes */
2312 case '\n': break;
2313 case '\\': *p++ = '\\'; break;
2314 case '\'': *p++ = '\''; break;
2315 case '\"': *p++ = '\"'; break;
2316 case 'b': *p++ = '\b'; break;
2317 case 'f': *p++ = '\014'; break; /* FF */
2318 case 't': *p++ = '\t'; break;
2319 case 'n': *p++ = '\n'; break;
2320 case 'r': *p++ = '\r'; break;
2321 case 'v': *p++ = '\013'; break; /* VT */
2322 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2323
2324 /* \OOO (octal) escapes */
2325 case '0': case '1': case '2': case '3':
2326 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002327 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002328 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002329 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002330 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002331 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002332 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002333 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002334 break;
2335
Fredrik Lundhccc74732001-02-18 22:13:49 +00002336 /* hex escapes */
2337 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002338 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002339 digits = 2;
2340 message = "truncated \\xXX escape";
2341 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002342
Fredrik Lundhccc74732001-02-18 22:13:49 +00002343 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002344 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002345 digits = 4;
2346 message = "truncated \\uXXXX escape";
2347 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002348
Fredrik Lundhccc74732001-02-18 22:13:49 +00002349 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002350 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002351 digits = 8;
2352 message = "truncated \\UXXXXXXXX escape";
2353 hexescape:
2354 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002355 outpos = p-PyUnicode_AS_UNICODE(v);
2356 if (s+digits>end) {
2357 endinpos = size;
2358 if (unicode_decode_call_errorhandler(
2359 errors, &errorHandler,
2360 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002361 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002362 (PyObject **)&v, &outpos, &p))
2363 goto onError;
2364 goto nextByte;
2365 }
2366 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002367 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002368 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002369 endinpos = (s+i+1)-starts;
2370 if (unicode_decode_call_errorhandler(
2371 errors, &errorHandler,
2372 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002373 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002374 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002375 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002376 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002377 }
2378 chr = (chr<<4) & ~0xF;
2379 if (c >= '0' && c <= '9')
2380 chr += c - '0';
2381 else if (c >= 'a' && c <= 'f')
2382 chr += 10 + c - 'a';
2383 else
2384 chr += 10 + c - 'A';
2385 }
2386 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002387 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002388 /* _decoding_error will have already written into the
2389 target buffer. */
2390 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002391 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002392 /* when we get here, chr is a 32-bit unicode character */
2393 if (chr <= 0xffff)
2394 /* UCS-2 character */
2395 *p++ = (Py_UNICODE) chr;
2396 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002397 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002398 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002399#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002400 *p++ = chr;
2401#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002402 chr -= 0x10000L;
2403 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002404 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002405#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002406 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002407 endinpos = s-starts;
2408 outpos = p-PyUnicode_AS_UNICODE(v);
2409 if (unicode_decode_call_errorhandler(
2410 errors, &errorHandler,
2411 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002412 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002413 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002414 goto onError;
2415 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002416 break;
2417
2418 /* \N{name} */
2419 case 'N':
2420 message = "malformed \\N character escape";
2421 if (ucnhash_CAPI == NULL) {
2422 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002423 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002424 m = PyImport_ImportModule("unicodedata");
2425 if (m == NULL)
2426 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002427 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002428 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002429 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002430 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002431 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002432 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002433 if (ucnhash_CAPI == NULL)
2434 goto ucnhashError;
2435 }
2436 if (*s == '{') {
2437 const char *start = s+1;
2438 /* look for the closing brace */
2439 while (*s != '}' && s < end)
2440 s++;
2441 if (s > start && s < end && *s == '}') {
2442 /* found a name. look it up in the unicode database */
2443 message = "unknown Unicode character name";
2444 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002445 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002446 goto store;
2447 }
2448 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002449 endinpos = s-starts;
2450 outpos = p-PyUnicode_AS_UNICODE(v);
2451 if (unicode_decode_call_errorhandler(
2452 errors, &errorHandler,
2453 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002454 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002455 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002456 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002457 break;
2458
2459 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002460 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002461 message = "\\ at end of string";
2462 s--;
2463 endinpos = s-starts;
2464 outpos = p-PyUnicode_AS_UNICODE(v);
2465 if (unicode_decode_call_errorhandler(
2466 errors, &errorHandler,
2467 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002468 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002469 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002470 goto onError;
2471 }
2472 else {
2473 *p++ = '\\';
2474 *p++ = (unsigned char)s[-1];
2475 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002476 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002477 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002478 nextByte:
2479 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002480 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002481 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002482 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002483 Py_XDECREF(errorHandler);
2484 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002485 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002486
Fredrik Lundhccc74732001-02-18 22:13:49 +00002487ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002488 PyErr_SetString(
2489 PyExc_UnicodeError,
2490 "\\N escapes not supported (can't load unicodedata module)"
2491 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002492 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002493 Py_XDECREF(errorHandler);
2494 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002495 return NULL;
2496
Fredrik Lundhccc74732001-02-18 22:13:49 +00002497onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002498 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002499 Py_XDECREF(errorHandler);
2500 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002501 return NULL;
2502}
2503
2504/* Return a Unicode-Escape string version of the Unicode object.
2505
2506 If quotes is true, the string is enclosed in u"" or u'' quotes as
2507 appropriate.
2508
2509*/
2510
Thomas Wouters477c8d52006-05-27 19:21:47 +00002511Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2512 Py_ssize_t size,
2513 Py_UNICODE ch)
2514{
2515 /* like wcschr, but doesn't stop at NULL characters */
2516
2517 while (size-- > 0) {
2518 if (*s == ch)
2519 return s;
2520 s++;
2521 }
2522
2523 return NULL;
2524}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002525
Walter Dörwald79e913e2007-05-12 11:08:06 +00002526static const char *hexdigits = "0123456789abcdef";
2527
2528PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2529 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002530{
2531 PyObject *repr;
2532 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002533
Thomas Wouters89f507f2006-12-13 04:49:30 +00002534 /* XXX(nnorwitz): rather than over-allocating, it would be
2535 better to choose a different scheme. Perhaps scan the
2536 first N-chars of the string and allocate based on that size.
2537 */
2538 /* Initial allocation is based on the longest-possible unichr
2539 escape.
2540
2541 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2542 unichr, so in this case it's the longest unichr escape. In
2543 narrow (UTF-16) builds this is five chars per source unichr
2544 since there are two unichrs in the surrogate pair, so in narrow
2545 (UTF-16) builds it's not the longest unichr escape.
2546
2547 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2548 so in the narrow (UTF-16) build case it's the longest unichr
2549 escape.
2550 */
2551
Walter Dörwald79e913e2007-05-12 11:08:06 +00002552 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002553#ifdef Py_UNICODE_WIDE
2554 + 10*size
2555#else
2556 + 6*size
2557#endif
2558 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559 if (repr == NULL)
2560 return NULL;
2561
Walter Dörwald79e913e2007-05-12 11:08:06 +00002562 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002563
Guido van Rossumd57fd912000-03-10 22:53:23 +00002564 while (size-- > 0) {
2565 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002566
Walter Dörwald79e913e2007-05-12 11:08:06 +00002567 /* Escape backslashes */
2568 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002569 *p++ = '\\';
2570 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002571 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002572 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002573
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002574#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002575 /* Map 21-bit characters to '\U00xxxxxx' */
2576 else if (ch >= 0x10000) {
2577 *p++ = '\\';
2578 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002579 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2580 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2581 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2582 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2583 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2584 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2585 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2586 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002587 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002588 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002589#else
2590 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002591 else if (ch >= 0xD800 && ch < 0xDC00) {
2592 Py_UNICODE ch2;
2593 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002594
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002595 ch2 = *s++;
2596 size--;
2597 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2598 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2599 *p++ = '\\';
2600 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002601 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2602 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2603 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2604 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2605 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2606 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2607 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2608 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002609 continue;
2610 }
2611 /* Fall through: isolated surrogates are copied as-is */
2612 s--;
2613 size++;
2614 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002615#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002616
Guido van Rossumd57fd912000-03-10 22:53:23 +00002617 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002618 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002619 *p++ = '\\';
2620 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002621 *p++ = hexdigits[(ch >> 12) & 0x000F];
2622 *p++ = hexdigits[(ch >> 8) & 0x000F];
2623 *p++ = hexdigits[(ch >> 4) & 0x000F];
2624 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002625 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002626
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002627 /* Map special whitespace to '\t', \n', '\r' */
2628 else if (ch == '\t') {
2629 *p++ = '\\';
2630 *p++ = 't';
2631 }
2632 else if (ch == '\n') {
2633 *p++ = '\\';
2634 *p++ = 'n';
2635 }
2636 else if (ch == '\r') {
2637 *p++ = '\\';
2638 *p++ = 'r';
2639 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002640
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002641 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002642 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002643 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002644 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002645 *p++ = hexdigits[(ch >> 4) & 0x000F];
2646 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002647 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002648
Guido van Rossumd57fd912000-03-10 22:53:23 +00002649 /* Copy everything else as-is */
2650 else
2651 *p++ = (char) ch;
2652 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002653
2654 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002655 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2656 Py_DECREF(repr);
2657 return NULL;
2658 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002659 return repr;
2660}
2661
Guido van Rossumd57fd912000-03-10 22:53:23 +00002662PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2663{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002664 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002665 if (!PyUnicode_Check(unicode)) {
2666 PyErr_BadArgument();
2667 return NULL;
2668 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002669 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2670 PyUnicode_GET_SIZE(unicode));
2671
2672 if (!s)
2673 return NULL;
2674 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2675 PyBytes_GET_SIZE(s));
2676 Py_DECREF(s);
2677 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002678}
2679
2680/* --- Raw Unicode Escape Codec ------------------------------------------- */
2681
2682PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002683 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002684 const char *errors)
2685{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002686 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002687 Py_ssize_t startinpos;
2688 Py_ssize_t endinpos;
2689 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002690 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002691 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002692 const char *end;
2693 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002694 PyObject *errorHandler = NULL;
2695 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002696
Guido van Rossumd57fd912000-03-10 22:53:23 +00002697 /* Escaped strings will always be longer than the resulting
2698 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002699 length after conversion to the true value. (But decoding error
2700 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002701 v = _PyUnicode_New(size);
2702 if (v == NULL)
2703 goto onError;
2704 if (size == 0)
2705 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002706 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002707 end = s + size;
2708 while (s < end) {
2709 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002710 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002711 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002712 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002713
2714 /* Non-escape characters are interpreted as Unicode ordinals */
2715 if (*s != '\\') {
2716 *p++ = (unsigned char)*s++;
2717 continue;
2718 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002719 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002720
2721 /* \u-escapes are only interpreted iff the number of leading
2722 backslashes if odd */
2723 bs = s;
2724 for (;s < end;) {
2725 if (*s != '\\')
2726 break;
2727 *p++ = (unsigned char)*s++;
2728 }
2729 if (((s - bs) & 1) == 0 ||
2730 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002731 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002732 continue;
2733 }
2734 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002735 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002736 s++;
2737
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002738 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002739 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002740 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002741 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002743 endinpos = s-starts;
2744 if (unicode_decode_call_errorhandler(
2745 errors, &errorHandler,
2746 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002747 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002748 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002749 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002750 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002751 }
2752 x = (x<<4) & ~0xF;
2753 if (c >= '0' && c <= '9')
2754 x += c - '0';
2755 else if (c >= 'a' && c <= 'f')
2756 x += 10 + c - 'a';
2757 else
2758 x += 10 + c - 'A';
2759 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002760#ifndef Py_UNICODE_WIDE
2761 if (x > 0x10000) {
2762 if (unicode_decode_call_errorhandler(
2763 errors, &errorHandler,
2764 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002765 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002766 (PyObject **)&v, &outpos, &p))
2767 goto onError;
2768 }
2769#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002770 *p++ = x;
2771 nextByte:
2772 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002773 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002774 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002775 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002776 Py_XDECREF(errorHandler);
2777 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002778 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002779
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780 onError:
2781 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002782 Py_XDECREF(errorHandler);
2783 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784 return NULL;
2785}
2786
2787PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002788 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002789{
2790 PyObject *repr;
2791 char *p;
2792 char *q;
2793
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002794#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00002795 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002796#else
Walter Dörwald711005d2007-05-12 12:03:26 +00002797 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002798#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799 if (repr == NULL)
2800 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002801 if (size == 0)
2802 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803
Walter Dörwald711005d2007-05-12 12:03:26 +00002804 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002805 while (size-- > 0) {
2806 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002807#ifdef Py_UNICODE_WIDE
2808 /* Map 32-bit characters to '\Uxxxxxxxx' */
2809 if (ch >= 0x10000) {
2810 *p++ = '\\';
2811 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002812 *p++ = hexdigits[(ch >> 28) & 0xf];
2813 *p++ = hexdigits[(ch >> 24) & 0xf];
2814 *p++ = hexdigits[(ch >> 20) & 0xf];
2815 *p++ = hexdigits[(ch >> 16) & 0xf];
2816 *p++ = hexdigits[(ch >> 12) & 0xf];
2817 *p++ = hexdigits[(ch >> 8) & 0xf];
2818 *p++ = hexdigits[(ch >> 4) & 0xf];
2819 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002820 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002821 else
2822#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002823 /* Map 16-bit characters to '\uxxxx' */
2824 if (ch >= 256) {
2825 *p++ = '\\';
2826 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002827 *p++ = hexdigits[(ch >> 12) & 0xf];
2828 *p++ = hexdigits[(ch >> 8) & 0xf];
2829 *p++ = hexdigits[(ch >> 4) & 0xf];
2830 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002831 }
2832 /* Copy everything else as-is */
2833 else
2834 *p++ = (char) ch;
2835 }
2836 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00002837 if (PyBytes_Resize(repr, p - q)) {
2838 Py_DECREF(repr);
2839 return NULL;
2840 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002841 return repr;
2842}
2843
2844PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2845{
Walter Dörwald711005d2007-05-12 12:03:26 +00002846 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002847 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00002848 PyErr_BadArgument();
2849 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002850 }
Walter Dörwald711005d2007-05-12 12:03:26 +00002851 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2852 PyUnicode_GET_SIZE(unicode));
2853
2854 if (!s)
2855 return NULL;
2856 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2857 PyBytes_GET_SIZE(s));
2858 Py_DECREF(s);
2859 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002860}
2861
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002862/* --- Unicode Internal Codec ------------------------------------------- */
2863
2864PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002865 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002866 const char *errors)
2867{
2868 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002869 Py_ssize_t startinpos;
2870 Py_ssize_t endinpos;
2871 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002872 PyUnicodeObject *v;
2873 Py_UNICODE *p;
2874 const char *end;
2875 const char *reason;
2876 PyObject *errorHandler = NULL;
2877 PyObject *exc = NULL;
2878
Neal Norwitzd43069c2006-01-08 01:12:10 +00002879#ifdef Py_UNICODE_WIDE
2880 Py_UNICODE unimax = PyUnicode_GetMax();
2881#endif
2882
Thomas Wouters89f507f2006-12-13 04:49:30 +00002883 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002884 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2885 if (v == NULL)
2886 goto onError;
2887 if (PyUnicode_GetSize((PyObject *)v) == 0)
2888 return (PyObject *)v;
2889 p = PyUnicode_AS_UNICODE(v);
2890 end = s + size;
2891
2892 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002893 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002894 /* We have to sanity check the raw data, otherwise doom looms for
2895 some malformed UCS-4 data. */
2896 if (
2897 #ifdef Py_UNICODE_WIDE
2898 *p > unimax || *p < 0 ||
2899 #endif
2900 end-s < Py_UNICODE_SIZE
2901 )
2902 {
2903 startinpos = s - starts;
2904 if (end-s < Py_UNICODE_SIZE) {
2905 endinpos = end-starts;
2906 reason = "truncated input";
2907 }
2908 else {
2909 endinpos = s - starts + Py_UNICODE_SIZE;
2910 reason = "illegal code point (> 0x10FFFF)";
2911 }
2912 outpos = p - PyUnicode_AS_UNICODE(v);
2913 if (unicode_decode_call_errorhandler(
2914 errors, &errorHandler,
2915 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002916 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002917 (PyObject **)&v, &outpos, &p)) {
2918 goto onError;
2919 }
2920 }
2921 else {
2922 p++;
2923 s += Py_UNICODE_SIZE;
2924 }
2925 }
2926
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002927 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002928 goto onError;
2929 Py_XDECREF(errorHandler);
2930 Py_XDECREF(exc);
2931 return (PyObject *)v;
2932
2933 onError:
2934 Py_XDECREF(v);
2935 Py_XDECREF(errorHandler);
2936 Py_XDECREF(exc);
2937 return NULL;
2938}
2939
Guido van Rossumd57fd912000-03-10 22:53:23 +00002940/* --- Latin-1 Codec ------------------------------------------------------ */
2941
2942PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002943 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002944 const char *errors)
2945{
2946 PyUnicodeObject *v;
2947 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002948
Guido van Rossumd57fd912000-03-10 22:53:23 +00002949 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002950 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002951 Py_UNICODE r = *(unsigned char*)s;
2952 return PyUnicode_FromUnicode(&r, 1);
2953 }
2954
Guido van Rossumd57fd912000-03-10 22:53:23 +00002955 v = _PyUnicode_New(size);
2956 if (v == NULL)
2957 goto onError;
2958 if (size == 0)
2959 return (PyObject *)v;
2960 p = PyUnicode_AS_UNICODE(v);
2961 while (size-- > 0)
2962 *p++ = (unsigned char)*s++;
2963 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002964
Guido van Rossumd57fd912000-03-10 22:53:23 +00002965 onError:
2966 Py_XDECREF(v);
2967 return NULL;
2968}
2969
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002970/* create or adjust a UnicodeEncodeError */
2971static void make_encode_exception(PyObject **exceptionObject,
2972 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002973 const Py_UNICODE *unicode, Py_ssize_t size,
2974 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002975 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002976{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002977 if (*exceptionObject == NULL) {
2978 *exceptionObject = PyUnicodeEncodeError_Create(
2979 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002980 }
2981 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002982 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2983 goto onError;
2984 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2985 goto onError;
2986 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2987 goto onError;
2988 return;
2989 onError:
2990 Py_DECREF(*exceptionObject);
2991 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002992 }
2993}
2994
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002995/* raises a UnicodeEncodeError */
2996static void raise_encode_exception(PyObject **exceptionObject,
2997 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002998 const Py_UNICODE *unicode, Py_ssize_t size,
2999 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003000 const char *reason)
3001{
3002 make_encode_exception(exceptionObject,
3003 encoding, unicode, size, startpos, endpos, reason);
3004 if (*exceptionObject != NULL)
3005 PyCodec_StrictErrors(*exceptionObject);
3006}
3007
3008/* error handling callback helper:
3009 build arguments, call the callback and check the arguments,
3010 put the result into newpos and return the replacement string, which
3011 has to be freed by the caller */
3012static PyObject *unicode_encode_call_errorhandler(const char *errors,
3013 PyObject **errorHandler,
3014 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003015 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3016 Py_ssize_t startpos, Py_ssize_t endpos,
3017 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003018{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003019 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003020
3021 PyObject *restuple;
3022 PyObject *resunicode;
3023
3024 if (*errorHandler == NULL) {
3025 *errorHandler = PyCodec_LookupError(errors);
3026 if (*errorHandler == NULL)
3027 return NULL;
3028 }
3029
3030 make_encode_exception(exceptionObject,
3031 encoding, unicode, size, startpos, endpos, reason);
3032 if (*exceptionObject == NULL)
3033 return NULL;
3034
3035 restuple = PyObject_CallFunctionObjArgs(
3036 *errorHandler, *exceptionObject, NULL);
3037 if (restuple == NULL)
3038 return NULL;
3039 if (!PyTuple_Check(restuple)) {
3040 PyErr_Format(PyExc_TypeError, &argparse[4]);
3041 Py_DECREF(restuple);
3042 return NULL;
3043 }
3044 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3045 &resunicode, newpos)) {
3046 Py_DECREF(restuple);
3047 return NULL;
3048 }
3049 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003050 *newpos = size+*newpos;
3051 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003052 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003053 Py_DECREF(restuple);
3054 return NULL;
3055 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003056 Py_INCREF(resunicode);
3057 Py_DECREF(restuple);
3058 return resunicode;
3059}
3060
3061static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003062 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003063 const char *errors,
3064 int limit)
3065{
3066 /* output object */
3067 PyObject *res;
3068 /* pointers to the beginning and end+1 of input */
3069 const Py_UNICODE *startp = p;
3070 const Py_UNICODE *endp = p + size;
3071 /* pointer to the beginning of the unencodable characters */
3072 /* const Py_UNICODE *badp = NULL; */
3073 /* pointer into the output */
3074 char *str;
3075 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003076 Py_ssize_t respos = 0;
3077 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003078 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3079 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003080 PyObject *errorHandler = NULL;
3081 PyObject *exc = NULL;
3082 /* the following variable is used for caching string comparisons
3083 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3084 int known_errorHandler = -1;
3085
3086 /* allocate enough for a simple encoding without
3087 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003088 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003089 if (res == NULL)
3090 goto onError;
3091 if (size == 0)
3092 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003093 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003094 ressize = size;
3095
3096 while (p<endp) {
3097 Py_UNICODE c = *p;
3098
3099 /* can we encode this? */
3100 if (c<limit) {
3101 /* no overflow check, because we know that the space is enough */
3102 *str++ = (char)c;
3103 ++p;
3104 }
3105 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003106 Py_ssize_t unicodepos = p-startp;
3107 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003108 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003109 Py_ssize_t repsize;
3110 Py_ssize_t newpos;
3111 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003112 Py_UNICODE *uni2;
3113 /* startpos for collecting unencodable chars */
3114 const Py_UNICODE *collstart = p;
3115 const Py_UNICODE *collend = p;
3116 /* find all unecodable characters */
3117 while ((collend < endp) && ((*collend)>=limit))
3118 ++collend;
3119 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3120 if (known_errorHandler==-1) {
3121 if ((errors==NULL) || (!strcmp(errors, "strict")))
3122 known_errorHandler = 1;
3123 else if (!strcmp(errors, "replace"))
3124 known_errorHandler = 2;
3125 else if (!strcmp(errors, "ignore"))
3126 known_errorHandler = 3;
3127 else if (!strcmp(errors, "xmlcharrefreplace"))
3128 known_errorHandler = 4;
3129 else
3130 known_errorHandler = 0;
3131 }
3132 switch (known_errorHandler) {
3133 case 1: /* strict */
3134 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3135 goto onError;
3136 case 2: /* replace */
3137 while (collstart++<collend)
3138 *str++ = '?'; /* fall through */
3139 case 3: /* ignore */
3140 p = collend;
3141 break;
3142 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003143 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003144 /* determine replacement size (temporarily (mis)uses p) */
3145 for (p = collstart, repsize = 0; p < collend; ++p) {
3146 if (*p<10)
3147 repsize += 2+1+1;
3148 else if (*p<100)
3149 repsize += 2+2+1;
3150 else if (*p<1000)
3151 repsize += 2+3+1;
3152 else if (*p<10000)
3153 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003154#ifndef Py_UNICODE_WIDE
3155 else
3156 repsize += 2+5+1;
3157#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003158 else if (*p<100000)
3159 repsize += 2+5+1;
3160 else if (*p<1000000)
3161 repsize += 2+6+1;
3162 else
3163 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003164#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003165 }
3166 requiredsize = respos+repsize+(endp-collend);
3167 if (requiredsize > ressize) {
3168 if (requiredsize<2*ressize)
3169 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003170 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003171 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003172 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003173 ressize = requiredsize;
3174 }
3175 /* generate replacement (temporarily (mis)uses p) */
3176 for (p = collstart; p < collend; ++p) {
3177 str += sprintf(str, "&#%d;", (int)*p);
3178 }
3179 p = collend;
3180 break;
3181 default:
3182 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3183 encoding, reason, startp, size, &exc,
3184 collstart-startp, collend-startp, &newpos);
3185 if (repunicode == NULL)
3186 goto onError;
3187 /* need more space? (at least enough for what we
3188 have+the replacement+the rest of the string, so
3189 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003190 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003191 repsize = PyUnicode_GET_SIZE(repunicode);
3192 requiredsize = respos+repsize+(endp-collend);
3193 if (requiredsize > ressize) {
3194 if (requiredsize<2*ressize)
3195 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003196 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003197 Py_DECREF(repunicode);
3198 goto onError;
3199 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003200 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003201 ressize = requiredsize;
3202 }
3203 /* check if there is anything unencodable in the replacement
3204 and copy it to the output */
3205 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3206 c = *uni2;
3207 if (c >= limit) {
3208 raise_encode_exception(&exc, encoding, startp, size,
3209 unicodepos, unicodepos+1, reason);
3210 Py_DECREF(repunicode);
3211 goto onError;
3212 }
3213 *str = (char)c;
3214 }
3215 p = startp + newpos;
3216 Py_DECREF(repunicode);
3217 }
3218 }
3219 }
3220 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003221 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003222 if (respos<ressize)
3223 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003224 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003225 Py_XDECREF(errorHandler);
3226 Py_XDECREF(exc);
3227 return res;
3228
3229 onError:
3230 Py_XDECREF(res);
3231 Py_XDECREF(errorHandler);
3232 Py_XDECREF(exc);
3233 return NULL;
3234}
3235
Guido van Rossumd57fd912000-03-10 22:53:23 +00003236PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003237 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003238 const char *errors)
3239{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003240 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003241}
3242
3243PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3244{
3245 if (!PyUnicode_Check(unicode)) {
3246 PyErr_BadArgument();
3247 return NULL;
3248 }
3249 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3250 PyUnicode_GET_SIZE(unicode),
3251 NULL);
3252}
3253
3254/* --- 7-bit ASCII Codec -------------------------------------------------- */
3255
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003257 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003258 const char *errors)
3259{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003260 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261 PyUnicodeObject *v;
3262 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003263 Py_ssize_t startinpos;
3264 Py_ssize_t endinpos;
3265 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003266 const char *e;
3267 PyObject *errorHandler = NULL;
3268 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003269
Guido van Rossumd57fd912000-03-10 22:53:23 +00003270 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003271 if (size == 1 && *(unsigned char*)s < 128) {
3272 Py_UNICODE r = *(unsigned char*)s;
3273 return PyUnicode_FromUnicode(&r, 1);
3274 }
Tim Petersced69f82003-09-16 20:30:58 +00003275
Guido van Rossumd57fd912000-03-10 22:53:23 +00003276 v = _PyUnicode_New(size);
3277 if (v == NULL)
3278 goto onError;
3279 if (size == 0)
3280 return (PyObject *)v;
3281 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003282 e = s + size;
3283 while (s < e) {
3284 register unsigned char c = (unsigned char)*s;
3285 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003286 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003287 ++s;
3288 }
3289 else {
3290 startinpos = s-starts;
3291 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003292 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003293 if (unicode_decode_call_errorhandler(
3294 errors, &errorHandler,
3295 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003296 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003297 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003299 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003300 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003301 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003302 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003303 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003304 Py_XDECREF(errorHandler);
3305 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003307
Guido van Rossumd57fd912000-03-10 22:53:23 +00003308 onError:
3309 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003310 Py_XDECREF(errorHandler);
3311 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003312 return NULL;
3313}
3314
Guido van Rossumd57fd912000-03-10 22:53:23 +00003315PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003316 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003317 const char *errors)
3318{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003319 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003320}
3321
3322PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3323{
3324 if (!PyUnicode_Check(unicode)) {
3325 PyErr_BadArgument();
3326 return NULL;
3327 }
3328 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3329 PyUnicode_GET_SIZE(unicode),
3330 NULL);
3331}
3332
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003333#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003334
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003335/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003336
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003337#if SIZEOF_INT < SIZEOF_SSIZE_T
3338#define NEED_RETRY
3339#endif
3340
3341/* XXX This code is limited to "true" double-byte encodings, as
3342 a) it assumes an incomplete character consists of a single byte, and
3343 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3344 encodings, see IsDBCSLeadByteEx documentation. */
3345
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte,
   0 otherwise.  A byte that IsDBCSLeadByte() accepts counts only when
   the character preceding it (as found by CharPrev) does not show it
   to be the second byte of another character. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3356
3357/*
3358 * Decode MBCS string into unicode object. If 'final' is set, converts
3359 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3360 */
3361static int decode_mbcs(PyUnicodeObject **v,
3362 const char *s, /* MBCS string */
3363 int size, /* sizeof MBCS string */
3364 int final)
3365{
3366 Py_UNICODE *p;
3367 Py_ssize_t n = 0;
3368 int usize = 0;
3369
3370 assert(size >= 0);
3371
3372 /* Skip trailing lead-byte unless 'final' is set */
3373 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3374 --size;
3375
3376 /* First get the size of the result */
3377 if (size > 0) {
3378 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3379 if (usize == 0) {
3380 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3381 return -1;
3382 }
3383 }
3384
3385 if (*v == NULL) {
3386 /* Create unicode object */
3387 *v = _PyUnicode_New(usize);
3388 if (*v == NULL)
3389 return -1;
3390 }
3391 else {
3392 /* Extend unicode object */
3393 n = PyUnicode_GET_SIZE(*v);
3394 if (_PyUnicode_Resize(v, n + usize) < 0)
3395 return -1;
3396 }
3397
3398 /* Do the conversion */
3399 if (size > 0) {
3400 p = PyUnicode_AS_UNICODE(*v) + n;
3401 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3402 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3403 return -1;
3404 }
3405 }
3406
3407 return size;
3408}
3409
3410PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3411 Py_ssize_t size,
3412 const char *errors,
3413 Py_ssize_t *consumed)
3414{
3415 PyUnicodeObject *v = NULL;
3416 int done;
3417
3418 if (consumed)
3419 *consumed = 0;
3420
3421#ifdef NEED_RETRY
3422 retry:
3423 if (size > INT_MAX)
3424 done = decode_mbcs(&v, s, INT_MAX, 0);
3425 else
3426#endif
3427 done = decode_mbcs(&v, s, (int)size, !consumed);
3428
3429 if (done < 0) {
3430 Py_XDECREF(v);
3431 return NULL;
3432 }
3433
3434 if (consumed)
3435 *consumed += done;
3436
3437#ifdef NEED_RETRY
3438 if (size > INT_MAX) {
3439 s += done;
3440 size -= done;
3441 goto retry;
3442 }
3443#endif
3444
3445 return (PyObject *)v;
3446}
3447
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003448PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003449 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003450 const char *errors)
3451{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003452 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3453}
3454
3455/*
3456 * Convert unicode into string object (MBCS).
3457 * Returns 0 if succeed, -1 otherwise.
3458 */
3459static int encode_mbcs(PyObject **repr,
3460 const Py_UNICODE *p, /* unicode */
3461 int size) /* size of unicode */
3462{
3463 int mbcssize = 0;
3464 Py_ssize_t n = 0;
3465
3466 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003467
3468 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003469 if (size > 0) {
3470 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3471 if (mbcssize == 0) {
3472 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3473 return -1;
3474 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003475 }
3476
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003477 if (*repr == NULL) {
3478 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003479 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003480 if (*repr == NULL)
3481 return -1;
3482 }
3483 else {
3484 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003485 n = PyBytes_Size(*repr);
3486 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003487 return -1;
3488 }
3489
3490 /* Do the conversion */
3491 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003492 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003493 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3494 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3495 return -1;
3496 }
3497 }
3498
3499 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003500}
3501
3502PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003503 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003504 const char *errors)
3505{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003506 PyObject *repr = NULL;
3507 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003508
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003509#ifdef NEED_RETRY
3510 retry:
3511 if (size > INT_MAX)
3512 ret = encode_mbcs(&repr, p, INT_MAX);
3513 else
3514#endif
3515 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003516
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003517 if (ret < 0) {
3518 Py_XDECREF(repr);
3519 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003520 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003521
3522#ifdef NEED_RETRY
3523 if (size > INT_MAX) {
3524 p += INT_MAX;
3525 size -= INT_MAX;
3526 goto retry;
3527 }
3528#endif
3529
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003530 return repr;
3531}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003532
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003533PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3534{
3535 if (!PyUnicode_Check(unicode)) {
3536 PyErr_BadArgument();
3537 return NULL;
3538 }
3539 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3540 PyUnicode_GET_SIZE(unicode),
3541 NULL);
3542}
3543
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003544#undef NEED_RETRY
3545
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003546#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003547
Guido van Rossumd57fd912000-03-10 22:53:23 +00003548/* --- Character Mapping Codec -------------------------------------------- */
3549
Guido van Rossumd57fd912000-03-10 22:53:23 +00003550PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003551 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003552 PyObject *mapping,
3553 const char *errors)
3554{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003555 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003556 Py_ssize_t startinpos;
3557 Py_ssize_t endinpos;
3558 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003559 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003560 PyUnicodeObject *v;
3561 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003562 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003563 PyObject *errorHandler = NULL;
3564 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003565 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003566 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003567
Guido van Rossumd57fd912000-03-10 22:53:23 +00003568 /* Default to Latin-1 */
3569 if (mapping == NULL)
3570 return PyUnicode_DecodeLatin1(s, size, errors);
3571
3572 v = _PyUnicode_New(size);
3573 if (v == NULL)
3574 goto onError;
3575 if (size == 0)
3576 return (PyObject *)v;
3577 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003578 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003579 if (PyUnicode_CheckExact(mapping)) {
3580 mapstring = PyUnicode_AS_UNICODE(mapping);
3581 maplen = PyUnicode_GET_SIZE(mapping);
3582 while (s < e) {
3583 unsigned char ch = *s;
3584 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003585
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003586 if (ch < maplen)
3587 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003588
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003589 if (x == 0xfffe) {
3590 /* undefined mapping */
3591 outpos = p-PyUnicode_AS_UNICODE(v);
3592 startinpos = s-starts;
3593 endinpos = startinpos+1;
3594 if (unicode_decode_call_errorhandler(
3595 errors, &errorHandler,
3596 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003597 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003598 (PyObject **)&v, &outpos, &p)) {
3599 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003600 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003601 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003602 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003603 *p++ = x;
3604 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003605 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003606 }
3607 else {
3608 while (s < e) {
3609 unsigned char ch = *s;
3610 PyObject *w, *x;
3611
3612 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3613 w = PyInt_FromLong((long)ch);
3614 if (w == NULL)
3615 goto onError;
3616 x = PyObject_GetItem(mapping, w);
3617 Py_DECREF(w);
3618 if (x == NULL) {
3619 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3620 /* No mapping found means: mapping is undefined. */
3621 PyErr_Clear();
3622 x = Py_None;
3623 Py_INCREF(x);
3624 } else
3625 goto onError;
3626 }
3627
3628 /* Apply mapping */
3629 if (PyInt_Check(x)) {
3630 long value = PyInt_AS_LONG(x);
3631 if (value < 0 || value > 65535) {
3632 PyErr_SetString(PyExc_TypeError,
3633 "character mapping must be in range(65536)");
3634 Py_DECREF(x);
3635 goto onError;
3636 }
3637 *p++ = (Py_UNICODE)value;
3638 }
3639 else if (x == Py_None) {
3640 /* undefined mapping */
3641 outpos = p-PyUnicode_AS_UNICODE(v);
3642 startinpos = s-starts;
3643 endinpos = startinpos+1;
3644 if (unicode_decode_call_errorhandler(
3645 errors, &errorHandler,
3646 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003647 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003648 (PyObject **)&v, &outpos, &p)) {
3649 Py_DECREF(x);
3650 goto onError;
3651 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003652 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003653 continue;
3654 }
3655 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003656 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003657
3658 if (targetsize == 1)
3659 /* 1-1 mapping */
3660 *p++ = *PyUnicode_AS_UNICODE(x);
3661
3662 else if (targetsize > 1) {
3663 /* 1-n mapping */
3664 if (targetsize > extrachars) {
3665 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003666 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3667 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003668 (targetsize << 2);
3669 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003670 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003671 if (_PyUnicode_Resize(&v,
3672 PyUnicode_GET_SIZE(v) + needed) < 0) {
3673 Py_DECREF(x);
3674 goto onError;
3675 }
3676 p = PyUnicode_AS_UNICODE(v) + oldpos;
3677 }
3678 Py_UNICODE_COPY(p,
3679 PyUnicode_AS_UNICODE(x),
3680 targetsize);
3681 p += targetsize;
3682 extrachars -= targetsize;
3683 }
3684 /* 1-0 mapping: skip the character */
3685 }
3686 else {
3687 /* wrong return value */
3688 PyErr_SetString(PyExc_TypeError,
3689 "character mapping must return integer, None or unicode");
3690 Py_DECREF(x);
3691 goto onError;
3692 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003693 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003694 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003695 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003696 }
3697 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003698 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003699 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003700 Py_XDECREF(errorHandler);
3701 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003702 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003703
Guido van Rossumd57fd912000-03-10 22:53:23 +00003704 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003705 Py_XDECREF(errorHandler);
3706 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003707 Py_XDECREF(v);
3708 return NULL;
3709}
3710
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003711/* Charmap encoding: the lookup table */
3712
3713struct encoding_map{
3714 PyObject_HEAD
3715 unsigned char level1[32];
3716 int count2, count3;
3717 unsigned char level23[1];
3718};
3719
3720static PyObject*
3721encoding_map_size(PyObject *obj, PyObject* args)
3722{
3723 struct encoding_map *map = (struct encoding_map*)obj;
3724 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3725 128*map->count3);
3726}
3727
3728static PyMethodDef encoding_map_methods[] = {
3729 {"size", encoding_map_size, METH_NOARGS,
3730 PyDoc_STR("Return the size (in bytes) of this object") },
3731 { 0 }
3732};
3733
3734static void
3735encoding_map_dealloc(PyObject* o)
3736{
3737 PyObject_FREE(o);
3738}
3739
3740static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003741 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003742 "EncodingMap", /*tp_name*/
3743 sizeof(struct encoding_map), /*tp_basicsize*/
3744 0, /*tp_itemsize*/
3745 /* methods */
3746 encoding_map_dealloc, /*tp_dealloc*/
3747 0, /*tp_print*/
3748 0, /*tp_getattr*/
3749 0, /*tp_setattr*/
3750 0, /*tp_compare*/
3751 0, /*tp_repr*/
3752 0, /*tp_as_number*/
3753 0, /*tp_as_sequence*/
3754 0, /*tp_as_mapping*/
3755 0, /*tp_hash*/
3756 0, /*tp_call*/
3757 0, /*tp_str*/
3758 0, /*tp_getattro*/
3759 0, /*tp_setattro*/
3760 0, /*tp_as_buffer*/
3761 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3762 0, /*tp_doc*/
3763 0, /*tp_traverse*/
3764 0, /*tp_clear*/
3765 0, /*tp_richcompare*/
3766 0, /*tp_weaklistoffset*/
3767 0, /*tp_iter*/
3768 0, /*tp_iternext*/
3769 encoding_map_methods, /*tp_methods*/
3770 0, /*tp_members*/
3771 0, /*tp_getset*/
3772 0, /*tp_base*/
3773 0, /*tp_dict*/
3774 0, /*tp_descr_get*/
3775 0, /*tp_descr_set*/
3776 0, /*tp_dictoffset*/
3777 0, /*tp_init*/
3778 0, /*tp_alloc*/
3779 0, /*tp_new*/
3780 0, /*tp_free*/
3781 0, /*tp_is_gc*/
3782};
3783
3784PyObject*
3785PyUnicode_BuildEncodingMap(PyObject* string)
3786{
3787 Py_UNICODE *decode;
3788 PyObject *result;
3789 struct encoding_map *mresult;
3790 int i;
3791 int need_dict = 0;
3792 unsigned char level1[32];
3793 unsigned char level2[512];
3794 unsigned char *mlevel1, *mlevel2, *mlevel3;
3795 int count2 = 0, count3 = 0;
3796
3797 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3798 PyErr_BadArgument();
3799 return NULL;
3800 }
3801 decode = PyUnicode_AS_UNICODE(string);
3802 memset(level1, 0xFF, sizeof level1);
3803 memset(level2, 0xFF, sizeof level2);
3804
3805 /* If there isn't a one-to-one mapping of NULL to \0,
3806 or if there are non-BMP characters, we need to use
3807 a mapping dictionary. */
3808 if (decode[0] != 0)
3809 need_dict = 1;
3810 for (i = 1; i < 256; i++) {
3811 int l1, l2;
3812 if (decode[i] == 0
3813 #ifdef Py_UNICODE_WIDE
3814 || decode[i] > 0xFFFF
3815 #endif
3816 ) {
3817 need_dict = 1;
3818 break;
3819 }
3820 if (decode[i] == 0xFFFE)
3821 /* unmapped character */
3822 continue;
3823 l1 = decode[i] >> 11;
3824 l2 = decode[i] >> 7;
3825 if (level1[l1] == 0xFF)
3826 level1[l1] = count2++;
3827 if (level2[l2] == 0xFF)
3828 level2[l2] = count3++;
3829 }
3830
3831 if (count2 >= 0xFF || count3 >= 0xFF)
3832 need_dict = 1;
3833
3834 if (need_dict) {
3835 PyObject *result = PyDict_New();
3836 PyObject *key, *value;
3837 if (!result)
3838 return NULL;
3839 for (i = 0; i < 256; i++) {
3840 key = value = NULL;
3841 key = PyInt_FromLong(decode[i]);
3842 value = PyInt_FromLong(i);
3843 if (!key || !value)
3844 goto failed1;
3845 if (PyDict_SetItem(result, key, value) == -1)
3846 goto failed1;
3847 Py_DECREF(key);
3848 Py_DECREF(value);
3849 }
3850 return result;
3851 failed1:
3852 Py_XDECREF(key);
3853 Py_XDECREF(value);
3854 Py_DECREF(result);
3855 return NULL;
3856 }
3857
3858 /* Create a three-level trie */
3859 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3860 16*count2 + 128*count3 - 1);
3861 if (!result)
3862 return PyErr_NoMemory();
3863 PyObject_Init(result, &EncodingMapType);
3864 mresult = (struct encoding_map*)result;
3865 mresult->count2 = count2;
3866 mresult->count3 = count3;
3867 mlevel1 = mresult->level1;
3868 mlevel2 = mresult->level23;
3869 mlevel3 = mresult->level23 + 16*count2;
3870 memcpy(mlevel1, level1, 32);
3871 memset(mlevel2, 0xFF, 16*count2);
3872 memset(mlevel3, 0, 128*count3);
3873 count3 = 0;
3874 for (i = 1; i < 256; i++) {
3875 int o1, o2, o3, i2, i3;
3876 if (decode[i] == 0xFFFE)
3877 /* unmapped character */
3878 continue;
3879 o1 = decode[i]>>11;
3880 o2 = (decode[i]>>7) & 0xF;
3881 i2 = 16*mlevel1[o1] + o2;
3882 if (mlevel2[i2] == 0xFF)
3883 mlevel2[i2] = count3++;
3884 o3 = decode[i] & 0x7F;
3885 i3 = 128*mlevel2[i2] + o3;
3886 mlevel3[i3] = i;
3887 }
3888 return result;
3889}
3890
3891static int
3892encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3893{
3894 struct encoding_map *map = (struct encoding_map*)mapping;
3895 int l1 = c>>11;
3896 int l2 = (c>>7) & 0xF;
3897 int l3 = c & 0x7F;
3898 int i;
3899
3900#ifdef Py_UNICODE_WIDE
3901 if (c > 0xFFFF) {
3902 return -1;
3903 }
3904#endif
3905 if (c == 0)
3906 return 0;
3907 /* level 1*/
3908 i = map->level1[l1];
3909 if (i == 0xFF) {
3910 return -1;
3911 }
3912 /* level 2*/
3913 i = map->level23[16*i+l2];
3914 if (i == 0xFF) {
3915 return -1;
3916 }
3917 /* level 3 */
3918 i = map->level23[16*map->count2 + 128*i + l3];
3919 if (i == 0) {
3920 return -1;
3921 }
3922 return i;
3923}
3924
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003925/* Lookup the character ch in the mapping. If the character
3926 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003927 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003928static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003929{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003930 PyObject *w = PyInt_FromLong((long)c);
3931 PyObject *x;
3932
3933 if (w == NULL)
3934 return NULL;
3935 x = PyObject_GetItem(mapping, w);
3936 Py_DECREF(w);
3937 if (x == NULL) {
3938 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3939 /* No mapping found means: mapping is undefined. */
3940 PyErr_Clear();
3941 x = Py_None;
3942 Py_INCREF(x);
3943 return x;
3944 } else
3945 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003946 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003947 else if (x == Py_None)
3948 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003949 else if (PyInt_Check(x)) {
3950 long value = PyInt_AS_LONG(x);
3951 if (value < 0 || value > 255) {
3952 PyErr_SetString(PyExc_TypeError,
3953 "character mapping must be in range(256)");
3954 Py_DECREF(x);
3955 return NULL;
3956 }
3957 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003959 else if (PyString_Check(x))
3960 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003961 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003962 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00003963 PyErr_Format(PyExc_TypeError,
3964 "character mapping must return integer, None or str8, not %.400s",
3965 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003966 Py_DECREF(x);
3967 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003968 }
3969}
3970
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003971static int
Walter Dörwald827b0552007-05-12 13:23:53 +00003972charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003973{
Walter Dörwald827b0552007-05-12 13:23:53 +00003974 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003975 /* exponentially overallocate to minimize reallocations */
3976 if (requiredsize < 2*outsize)
3977 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00003978 if (PyBytes_Resize(outobj, requiredsize)) {
3979 Py_DECREF(outobj);
3980 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003981 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003982 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003983}
3984
/* Outcome of one charmapencode_output() call. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* the lookup or an output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003988/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00003989 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003990 space is available. Return a new reference to the object that
3991 was put in the output buffer, or Py_None, if the mapping was undefined
3992 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003993 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003994static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003995charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00003996 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003997{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003998 PyObject *rep;
3999 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00004000 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004001
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004002 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004003 int res = encoding_map_lookup(c, mapping);
4004 Py_ssize_t requiredsize = *outpos+1;
4005 if (res == -1)
4006 return enc_FAILED;
4007 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004008 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004009 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00004010 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004011 outstart[(*outpos)++] = (char)res;
4012 return enc_SUCCESS;
4013 }
4014
4015 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004016 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004017 return enc_EXCEPTION;
4018 else if (rep==Py_None) {
4019 Py_DECREF(rep);
4020 return enc_FAILED;
4021 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004022 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004023 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004024 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004025 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004026 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004027 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004028 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004029 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004030 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4031 }
4032 else {
4033 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004034 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4035 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004036 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004037 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004038 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004039 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004040 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004041 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004042 memcpy(outstart + *outpos, repchars, repsize);
4043 *outpos += repsize;
4044 }
4045 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004046 Py_DECREF(rep);
4047 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004048}
4049
4050/* handle an error in PyUnicode_EncodeCharmap
4051 Return 0 on success, -1 on error */
4052static
4053int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004054 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004055 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004056 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004057 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004058{
4059 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004060 Py_ssize_t repsize;
4061 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004062 Py_UNICODE *uni2;
4063 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004064 Py_ssize_t collstartpos = *inpos;
4065 Py_ssize_t collendpos = *inpos+1;
4066 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004067 char *encoding = "charmap";
4068 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004069 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004070
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004071 /* find all unencodable characters */
4072 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004073 PyObject *rep;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004074 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004075 int res = encoding_map_lookup(p[collendpos], mapping);
4076 if (res != -1)
4077 break;
4078 ++collendpos;
4079 continue;
4080 }
4081
4082 rep = charmapencode_lookup(p[collendpos], mapping);
4083 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004084 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004085 else if (rep!=Py_None) {
4086 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004087 break;
4088 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004089 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004090 ++collendpos;
4091 }
4092 /* cache callback name lookup
4093 * (if not done yet, i.e. it's the first error) */
4094 if (*known_errorHandler==-1) {
4095 if ((errors==NULL) || (!strcmp(errors, "strict")))
4096 *known_errorHandler = 1;
4097 else if (!strcmp(errors, "replace"))
4098 *known_errorHandler = 2;
4099 else if (!strcmp(errors, "ignore"))
4100 *known_errorHandler = 3;
4101 else if (!strcmp(errors, "xmlcharrefreplace"))
4102 *known_errorHandler = 4;
4103 else
4104 *known_errorHandler = 0;
4105 }
4106 switch (*known_errorHandler) {
4107 case 1: /* strict */
4108 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4109 return -1;
4110 case 2: /* replace */
4111 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4112 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004113 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004114 return -1;
4115 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004116 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004117 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4118 return -1;
4119 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004120 }
4121 /* fall through */
4122 case 3: /* ignore */
4123 *inpos = collendpos;
4124 break;
4125 case 4: /* xmlcharrefreplace */
4126 /* generate replacement (temporarily (mis)uses p) */
4127 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4128 char buffer[2+29+1+1];
4129 char *cp;
4130 sprintf(buffer, "&#%d;", (int)p[collpos]);
4131 for (cp = buffer; *cp; ++cp) {
4132 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004133 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004134 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004135 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004136 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4137 return -1;
4138 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004139 }
4140 }
4141 *inpos = collendpos;
4142 break;
4143 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004144 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004145 encoding, reason, p, size, exceptionObject,
4146 collstartpos, collendpos, &newpos);
4147 if (repunicode == NULL)
4148 return -1;
4149 /* generate replacement */
4150 repsize = PyUnicode_GET_SIZE(repunicode);
4151 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4152 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004153 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004154 return -1;
4155 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004156 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004157 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004158 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4159 return -1;
4160 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004161 }
4162 *inpos = newpos;
4163 Py_DECREF(repunicode);
4164 }
4165 return 0;
4166}
4167
Guido van Rossumd57fd912000-03-10 22:53:23 +00004168PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004169 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004170 PyObject *mapping,
4171 const char *errors)
4172{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004173 /* output object */
4174 PyObject *res = NULL;
4175 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004176 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004177 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004178 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004179 PyObject *errorHandler = NULL;
4180 PyObject *exc = NULL;
4181 /* the following variable is used for caching string comparisons
4182 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4183 * 3=ignore, 4=xmlcharrefreplace */
4184 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004185
4186 /* Default to Latin-1 */
4187 if (mapping == NULL)
4188 return PyUnicode_EncodeLatin1(p, size, errors);
4189
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004190 /* allocate enough for a simple encoding without
4191 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004192 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004193 if (res == NULL)
4194 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004195 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004196 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004197
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004198 while (inpos<size) {
4199 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004200 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004201 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004202 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004203 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004204 if (charmap_encoding_error(p, size, &inpos, mapping,
4205 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004206 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004207 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004208 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004209 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004210 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004211 else
4212 /* done with this character => adjust input position */
4213 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004214 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004215
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004216 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004217 if (respos<PyBytes_GET_SIZE(res)) {
4218 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004219 goto onError;
4220 }
4221 Py_XDECREF(exc);
4222 Py_XDECREF(errorHandler);
4223 return res;
4224
4225 onError:
4226 Py_XDECREF(res);
4227 Py_XDECREF(exc);
4228 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004229 return NULL;
4230}
4231
4232PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4233 PyObject *mapping)
4234{
4235 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4236 PyErr_BadArgument();
4237 return NULL;
4238 }
4239 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4240 PyUnicode_GET_SIZE(unicode),
4241 mapping,
4242 NULL);
4243}
4244
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004245/* create or adjust a UnicodeTranslateError */
4246static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004247 const Py_UNICODE *unicode, Py_ssize_t size,
4248 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004249 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004250{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004251 if (*exceptionObject == NULL) {
4252 *exceptionObject = PyUnicodeTranslateError_Create(
4253 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004254 }
4255 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004256 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4257 goto onError;
4258 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4259 goto onError;
4260 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4261 goto onError;
4262 return;
4263 onError:
4264 Py_DECREF(*exceptionObject);
4265 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004266 }
4267}
4268
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004269/* raises a UnicodeTranslateError */
4270static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004271 const Py_UNICODE *unicode, Py_ssize_t size,
4272 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004273 const char *reason)
4274{
4275 make_translate_exception(exceptionObject,
4276 unicode, size, startpos, endpos, reason);
4277 if (*exceptionObject != NULL)
4278 PyCodec_StrictErrors(*exceptionObject);
4279}
4280
4281/* error handling callback helper:
4282 build arguments, call the callback and check the arguments,
4283 put the result into newpos and return the replacement string, which
4284 has to be freed by the caller */
4285static PyObject *unicode_translate_call_errorhandler(const char *errors,
4286 PyObject **errorHandler,
4287 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004288 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4289 Py_ssize_t startpos, Py_ssize_t endpos,
4290 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004291{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004292 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004293
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004294 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004295 PyObject *restuple;
4296 PyObject *resunicode;
4297
4298 if (*errorHandler == NULL) {
4299 *errorHandler = PyCodec_LookupError(errors);
4300 if (*errorHandler == NULL)
4301 return NULL;
4302 }
4303
4304 make_translate_exception(exceptionObject,
4305 unicode, size, startpos, endpos, reason);
4306 if (*exceptionObject == NULL)
4307 return NULL;
4308
4309 restuple = PyObject_CallFunctionObjArgs(
4310 *errorHandler, *exceptionObject, NULL);
4311 if (restuple == NULL)
4312 return NULL;
4313 if (!PyTuple_Check(restuple)) {
4314 PyErr_Format(PyExc_TypeError, &argparse[4]);
4315 Py_DECREF(restuple);
4316 return NULL;
4317 }
4318 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004319 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004320 Py_DECREF(restuple);
4321 return NULL;
4322 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004323 if (i_newpos<0)
4324 *newpos = size+i_newpos;
4325 else
4326 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004327 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004328 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004329 Py_DECREF(restuple);
4330 return NULL;
4331 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004332 Py_INCREF(resunicode);
4333 Py_DECREF(restuple);
4334 return resunicode;
4335}
4336
4337/* Lookup the character ch in the mapping and put the result in result,
4338 which must be decrefed by the caller.
4339 Return 0 on success, -1 on error */
4340static
4341int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4342{
4343 PyObject *w = PyInt_FromLong((long)c);
4344 PyObject *x;
4345
4346 if (w == NULL)
4347 return -1;
4348 x = PyObject_GetItem(mapping, w);
4349 Py_DECREF(w);
4350 if (x == NULL) {
4351 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4352 /* No mapping found means: use 1:1 mapping. */
4353 PyErr_Clear();
4354 *result = NULL;
4355 return 0;
4356 } else
4357 return -1;
4358 }
4359 else if (x == Py_None) {
4360 *result = x;
4361 return 0;
4362 }
4363 else if (PyInt_Check(x)) {
4364 long value = PyInt_AS_LONG(x);
4365 long max = PyUnicode_GetMax();
4366 if (value < 0 || value > max) {
4367 PyErr_Format(PyExc_TypeError,
4368 "character mapping must be in range(0x%lx)", max+1);
4369 Py_DECREF(x);
4370 return -1;
4371 }
4372 *result = x;
4373 return 0;
4374 }
4375 else if (PyUnicode_Check(x)) {
4376 *result = x;
4377 return 0;
4378 }
4379 else {
4380 /* wrong return value */
4381 PyErr_SetString(PyExc_TypeError,
4382 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004383 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004384 return -1;
4385 }
4386}
4387/* ensure that *outobj is at least requiredsize characters long,
4388if not reallocate and adjust various state variables.
4389Return 0 on success, -1 on error */
4390static
Walter Dörwald4894c302003-10-24 14:25:28 +00004391int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004392 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004393{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004394 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004395 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004396 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004397 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004398 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004399 if (requiredsize < 2 * oldsize)
4400 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004401 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004402 return -1;
4403 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004404 }
4405 return 0;
4406}
4407/* lookup the character, put the result in the output string and adjust
4408 various state variables. Return a new reference to the object that
4409 was put in the output buffer in *result, or Py_None, if the mapping was
4410 undefined (in which case no character was written).
4411 The called must decref result.
4412 Return 0 on success, -1 on error. */
4413static
Walter Dörwald4894c302003-10-24 14:25:28 +00004414int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004415 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004416 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004417{
Walter Dörwald4894c302003-10-24 14:25:28 +00004418 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004419 return -1;
4420 if (*res==NULL) {
4421 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004422 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004423 }
4424 else if (*res==Py_None)
4425 ;
4426 else if (PyInt_Check(*res)) {
4427 /* no overflow check, because we know that the space is enough */
4428 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4429 }
4430 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004431 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004432 if (repsize==1) {
4433 /* no overflow check, because we know that the space is enough */
4434 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4435 }
4436 else if (repsize!=0) {
4437 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004438 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004439 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004440 repsize - 1;
4441 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004442 return -1;
4443 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4444 *outp += repsize;
4445 }
4446 }
4447 else
4448 return -1;
4449 return 0;
4450}
4451
4452PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004453 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004454 PyObject *mapping,
4455 const char *errors)
4456{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004457 /* output object */
4458 PyObject *res = NULL;
4459 /* pointers to the beginning and end+1 of input */
4460 const Py_UNICODE *startp = p;
4461 const Py_UNICODE *endp = p + size;
4462 /* pointer into the output */
4463 Py_UNICODE *str;
4464 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004465 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004466 char *reason = "character maps to <undefined>";
4467 PyObject *errorHandler = NULL;
4468 PyObject *exc = NULL;
4469 /* the following variable is used for caching string comparisons
4470 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4471 * 3=ignore, 4=xmlcharrefreplace */
4472 int known_errorHandler = -1;
4473
Guido van Rossumd57fd912000-03-10 22:53:23 +00004474 if (mapping == NULL) {
4475 PyErr_BadArgument();
4476 return NULL;
4477 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004478
4479 /* allocate enough for a simple 1:1 translation without
4480 replacements, if we need more, we'll resize */
4481 res = PyUnicode_FromUnicode(NULL, size);
4482 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004483 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004484 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004485 return res;
4486 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004487
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004488 while (p<endp) {
4489 /* try to encode it */
4490 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004491 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004492 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004493 goto onError;
4494 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004495 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004496 if (x!=Py_None) /* it worked => adjust input pointer */
4497 ++p;
4498 else { /* untranslatable character */
4499 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004500 Py_ssize_t repsize;
4501 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004502 Py_UNICODE *uni2;
4503 /* startpos for collecting untranslatable chars */
4504 const Py_UNICODE *collstart = p;
4505 const Py_UNICODE *collend = p+1;
4506 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004507
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004508 /* find all untranslatable characters */
4509 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004510 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004511 goto onError;
4512 Py_XDECREF(x);
4513 if (x!=Py_None)
4514 break;
4515 ++collend;
4516 }
4517 /* cache callback name lookup
4518 * (if not done yet, i.e. it's the first error) */
4519 if (known_errorHandler==-1) {
4520 if ((errors==NULL) || (!strcmp(errors, "strict")))
4521 known_errorHandler = 1;
4522 else if (!strcmp(errors, "replace"))
4523 known_errorHandler = 2;
4524 else if (!strcmp(errors, "ignore"))
4525 known_errorHandler = 3;
4526 else if (!strcmp(errors, "xmlcharrefreplace"))
4527 known_errorHandler = 4;
4528 else
4529 known_errorHandler = 0;
4530 }
4531 switch (known_errorHandler) {
4532 case 1: /* strict */
4533 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4534 goto onError;
4535 case 2: /* replace */
4536 /* No need to check for space, this is a 1:1 replacement */
4537 for (coll = collstart; coll<collend; ++coll)
4538 *str++ = '?';
4539 /* fall through */
4540 case 3: /* ignore */
4541 p = collend;
4542 break;
4543 case 4: /* xmlcharrefreplace */
4544 /* generate replacement (temporarily (mis)uses p) */
4545 for (p = collstart; p < collend; ++p) {
4546 char buffer[2+29+1+1];
4547 char *cp;
4548 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004549 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004550 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4551 goto onError;
4552 for (cp = buffer; *cp; ++cp)
4553 *str++ = *cp;
4554 }
4555 p = collend;
4556 break;
4557 default:
4558 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4559 reason, startp, size, &exc,
4560 collstart-startp, collend-startp, &newpos);
4561 if (repunicode == NULL)
4562 goto onError;
4563 /* generate replacement */
4564 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004565 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004566 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4567 Py_DECREF(repunicode);
4568 goto onError;
4569 }
4570 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4571 *str++ = *uni2;
4572 p = startp + newpos;
4573 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004574 }
4575 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004576 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004577 /* Resize if we allocated to much */
4578 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004579 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004580 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004581 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004582 }
4583 Py_XDECREF(exc);
4584 Py_XDECREF(errorHandler);
4585 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004586
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004587 onError:
4588 Py_XDECREF(res);
4589 Py_XDECREF(exc);
4590 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004591 return NULL;
4592}
4593
4594PyObject *PyUnicode_Translate(PyObject *str,
4595 PyObject *mapping,
4596 const char *errors)
4597{
4598 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004599
Guido van Rossumd57fd912000-03-10 22:53:23 +00004600 str = PyUnicode_FromObject(str);
4601 if (str == NULL)
4602 goto onError;
4603 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4604 PyUnicode_GET_SIZE(str),
4605 mapping,
4606 errors);
4607 Py_DECREF(str);
4608 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004609
Guido van Rossumd57fd912000-03-10 22:53:23 +00004610 onError:
4611 Py_XDECREF(str);
4612 return NULL;
4613}
Tim Petersced69f82003-09-16 20:30:58 +00004614
Guido van Rossum9e896b32000-04-05 20:11:21 +00004615/* --- Decimal Encoder ---------------------------------------------------- */
4616
4617int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004618 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004619 char *output,
4620 const char *errors)
4621{
4622 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004623 PyObject *errorHandler = NULL;
4624 PyObject *exc = NULL;
4625 const char *encoding = "decimal";
4626 const char *reason = "invalid decimal Unicode string";
4627 /* the following variable is used for caching string comparisons
4628 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4629 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004630
4631 if (output == NULL) {
4632 PyErr_BadArgument();
4633 return -1;
4634 }
4635
4636 p = s;
4637 end = s + length;
4638 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004639 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004640 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004641 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004642 Py_ssize_t repsize;
4643 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004644 Py_UNICODE *uni2;
4645 Py_UNICODE *collstart;
4646 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004647
Guido van Rossum9e896b32000-04-05 20:11:21 +00004648 if (Py_UNICODE_ISSPACE(ch)) {
4649 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004650 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004651 continue;
4652 }
4653 decimal = Py_UNICODE_TODECIMAL(ch);
4654 if (decimal >= 0) {
4655 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004656 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004657 continue;
4658 }
Guido van Rossumba477042000-04-06 18:18:10 +00004659 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004660 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004661 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004662 continue;
4663 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004664 /* All other characters are considered unencodable */
4665 collstart = p;
4666 collend = p+1;
4667 while (collend < end) {
4668 if ((0 < *collend && *collend < 256) ||
4669 !Py_UNICODE_ISSPACE(*collend) ||
4670 Py_UNICODE_TODECIMAL(*collend))
4671 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004672 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004673 /* cache callback name lookup
4674 * (if not done yet, i.e. it's the first error) */
4675 if (known_errorHandler==-1) {
4676 if ((errors==NULL) || (!strcmp(errors, "strict")))
4677 known_errorHandler = 1;
4678 else if (!strcmp(errors, "replace"))
4679 known_errorHandler = 2;
4680 else if (!strcmp(errors, "ignore"))
4681 known_errorHandler = 3;
4682 else if (!strcmp(errors, "xmlcharrefreplace"))
4683 known_errorHandler = 4;
4684 else
4685 known_errorHandler = 0;
4686 }
4687 switch (known_errorHandler) {
4688 case 1: /* strict */
4689 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4690 goto onError;
4691 case 2: /* replace */
4692 for (p = collstart; p < collend; ++p)
4693 *output++ = '?';
4694 /* fall through */
4695 case 3: /* ignore */
4696 p = collend;
4697 break;
4698 case 4: /* xmlcharrefreplace */
4699 /* generate replacement (temporarily (mis)uses p) */
4700 for (p = collstart; p < collend; ++p)
4701 output += sprintf(output, "&#%d;", (int)*p);
4702 p = collend;
4703 break;
4704 default:
4705 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4706 encoding, reason, s, length, &exc,
4707 collstart-s, collend-s, &newpos);
4708 if (repunicode == NULL)
4709 goto onError;
4710 /* generate replacement */
4711 repsize = PyUnicode_GET_SIZE(repunicode);
4712 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4713 Py_UNICODE ch = *uni2;
4714 if (Py_UNICODE_ISSPACE(ch))
4715 *output++ = ' ';
4716 else {
4717 decimal = Py_UNICODE_TODECIMAL(ch);
4718 if (decimal >= 0)
4719 *output++ = '0' + decimal;
4720 else if (0 < ch && ch < 256)
4721 *output++ = (char)ch;
4722 else {
4723 Py_DECREF(repunicode);
4724 raise_encode_exception(&exc, encoding,
4725 s, length, collstart-s, collend-s, reason);
4726 goto onError;
4727 }
4728 }
4729 }
4730 p = s + newpos;
4731 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004732 }
4733 }
4734 /* 0-terminate the output string */
4735 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004736 Py_XDECREF(exc);
4737 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004738 return 0;
4739
4740 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004741 Py_XDECREF(exc);
4742 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004743 return -1;
4744}
4745
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746/* --- Helpers ------------------------------------------------------------ */
4747
Thomas Wouters477c8d52006-05-27 19:21:47 +00004748#define STRINGLIB_CHAR Py_UNICODE
4749
4750#define STRINGLIB_LEN PyUnicode_GET_SIZE
4751#define STRINGLIB_NEW PyUnicode_FromUnicode
4752#define STRINGLIB_STR PyUnicode_AS_UNICODE
4753
4754Py_LOCAL_INLINE(int)
4755STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004756{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004757 if (str[0] != other[0])
4758 return 1;
4759 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004760}
4761
Thomas Wouters477c8d52006-05-27 19:21:47 +00004762#define STRINGLIB_EMPTY unicode_empty
4763
4764#include "stringlib/fastsearch.h"
4765
4766#include "stringlib/count.h"
4767#include "stringlib/find.h"
4768#include "stringlib/partition.h"
4769
/* Helper macro: normalize the slice variables `start` and `end` that are
   in scope at the point of use against (obj)->length.

   A negative value is first offset by the length and, if still negative,
   clamped to 0; `end` is additionally capped at the length.  `start` is
   not capped at the length.  The expansion is a sequence of statements,
   so it must be used where a statement list is valid. */
#define FIX_START_END(obj)              \
    if (start < 0) {                    \
        start += (obj)->length;         \
        if (start < 0)                  \
            start = 0;                  \
    }                                   \
    if (end > (obj)->length)            \
        end = (obj)->length;            \
    if (end < 0) {                      \
        end += (obj)->length;           \
        if (end < 0)                    \
            end = 0;                    \
    }
4782
Martin v. Löwis18e16552006-02-15 17:27:45 +00004783Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004784 PyObject *substr,
4785 Py_ssize_t start,
4786 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004787{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004788 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004789 PyUnicodeObject* str_obj;
4790 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004791
Thomas Wouters477c8d52006-05-27 19:21:47 +00004792 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4793 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004795 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4796 if (!sub_obj) {
4797 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004798 return -1;
4799 }
Tim Petersced69f82003-09-16 20:30:58 +00004800
Thomas Wouters477c8d52006-05-27 19:21:47 +00004801 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004802
Thomas Wouters477c8d52006-05-27 19:21:47 +00004803 result = stringlib_count(
4804 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4805 );
4806
4807 Py_DECREF(sub_obj);
4808 Py_DECREF(str_obj);
4809
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810 return result;
4811}
4812
Martin v. Löwis18e16552006-02-15 17:27:45 +00004813Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004814 PyObject *sub,
4815 Py_ssize_t start,
4816 Py_ssize_t end,
4817 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004818{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004819 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004820
Guido van Rossumd57fd912000-03-10 22:53:23 +00004821 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004822 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004823 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004824 sub = PyUnicode_FromObject(sub);
4825 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004826 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004827 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004828 }
Tim Petersced69f82003-09-16 20:30:58 +00004829
Thomas Wouters477c8d52006-05-27 19:21:47 +00004830 if (direction > 0)
4831 result = stringlib_find_slice(
4832 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4833 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4834 start, end
4835 );
4836 else
4837 result = stringlib_rfind_slice(
4838 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4839 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4840 start, end
4841 );
4842
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004844 Py_DECREF(sub);
4845
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846 return result;
4847}
4848
Tim Petersced69f82003-09-16 20:30:58 +00004849static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004850int tailmatch(PyUnicodeObject *self,
4851 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004852 Py_ssize_t start,
4853 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854 int direction)
4855{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856 if (substring->length == 0)
4857 return 1;
4858
Thomas Wouters477c8d52006-05-27 19:21:47 +00004859 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004860
4861 end -= substring->length;
4862 if (end < start)
4863 return 0;
4864
4865 if (direction > 0) {
4866 if (Py_UNICODE_MATCH(self, end, substring))
4867 return 1;
4868 } else {
4869 if (Py_UNICODE_MATCH(self, start, substring))
4870 return 1;
4871 }
4872
4873 return 0;
4874}
4875
Martin v. Löwis18e16552006-02-15 17:27:45 +00004876Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004878 Py_ssize_t start,
4879 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880 int direction)
4881{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004882 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004883
Guido van Rossumd57fd912000-03-10 22:53:23 +00004884 str = PyUnicode_FromObject(str);
4885 if (str == NULL)
4886 return -1;
4887 substr = PyUnicode_FromObject(substr);
4888 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004889 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890 return -1;
4891 }
Tim Petersced69f82003-09-16 20:30:58 +00004892
Guido van Rossumd57fd912000-03-10 22:53:23 +00004893 result = tailmatch((PyUnicodeObject *)str,
4894 (PyUnicodeObject *)substr,
4895 start, end, direction);
4896 Py_DECREF(str);
4897 Py_DECREF(substr);
4898 return result;
4899}
4900
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901/* Apply fixfct filter to the Unicode object self and return a
4902 reference to the modified object */
4903
Tim Petersced69f82003-09-16 20:30:58 +00004904static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004905PyObject *fixup(PyUnicodeObject *self,
4906 int (*fixfct)(PyUnicodeObject *s))
4907{
4908
4909 PyUnicodeObject *u;
4910
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004911 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004912 if (u == NULL)
4913 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004914
4915 Py_UNICODE_COPY(u->str, self->str, self->length);
4916
Tim Peters7a29bd52001-09-12 03:03:31 +00004917 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004918 /* fixfct should return TRUE if it modified the buffer. If
4919 FALSE, return a reference to the original buffer instead
4920 (to save space, not time) */
4921 Py_INCREF(self);
4922 Py_DECREF(u);
4923 return (PyObject*) self;
4924 }
4925 return (PyObject*) u;
4926}
4927
Tim Petersced69f82003-09-16 20:30:58 +00004928static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004929int fixupper(PyUnicodeObject *self)
4930{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004931 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004932 Py_UNICODE *s = self->str;
4933 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004934
Guido van Rossumd57fd912000-03-10 22:53:23 +00004935 while (len-- > 0) {
4936 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004937
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938 ch = Py_UNICODE_TOUPPER(*s);
4939 if (ch != *s) {
4940 status = 1;
4941 *s = ch;
4942 }
4943 s++;
4944 }
4945
4946 return status;
4947}
4948
Tim Petersced69f82003-09-16 20:30:58 +00004949static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004950int fixlower(PyUnicodeObject *self)
4951{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004952 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004953 Py_UNICODE *s = self->str;
4954 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004955
Guido van Rossumd57fd912000-03-10 22:53:23 +00004956 while (len-- > 0) {
4957 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004958
Guido van Rossumd57fd912000-03-10 22:53:23 +00004959 ch = Py_UNICODE_TOLOWER(*s);
4960 if (ch != *s) {
4961 status = 1;
4962 *s = ch;
4963 }
4964 s++;
4965 }
4966
4967 return status;
4968}
4969
Tim Petersced69f82003-09-16 20:30:58 +00004970static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004971int fixswapcase(PyUnicodeObject *self)
4972{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004973 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004974 Py_UNICODE *s = self->str;
4975 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004976
Guido van Rossumd57fd912000-03-10 22:53:23 +00004977 while (len-- > 0) {
4978 if (Py_UNICODE_ISUPPER(*s)) {
4979 *s = Py_UNICODE_TOLOWER(*s);
4980 status = 1;
4981 } else if (Py_UNICODE_ISLOWER(*s)) {
4982 *s = Py_UNICODE_TOUPPER(*s);
4983 status = 1;
4984 }
4985 s++;
4986 }
4987
4988 return status;
4989}
4990
Tim Petersced69f82003-09-16 20:30:58 +00004991static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004992int fixcapitalize(PyUnicodeObject *self)
4993{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004994 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004995 Py_UNICODE *s = self->str;
4996 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004997
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004998 if (len == 0)
4999 return 0;
5000 if (Py_UNICODE_ISLOWER(*s)) {
5001 *s = Py_UNICODE_TOUPPER(*s);
5002 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005003 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005004 s++;
5005 while (--len > 0) {
5006 if (Py_UNICODE_ISUPPER(*s)) {
5007 *s = Py_UNICODE_TOLOWER(*s);
5008 status = 1;
5009 }
5010 s++;
5011 }
5012 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005013}
5014
5015static
5016int fixtitle(PyUnicodeObject *self)
5017{
5018 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5019 register Py_UNICODE *e;
5020 int previous_is_cased;
5021
5022 /* Shortcut for single character strings */
5023 if (PyUnicode_GET_SIZE(self) == 1) {
5024 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5025 if (*p != ch) {
5026 *p = ch;
5027 return 1;
5028 }
5029 else
5030 return 0;
5031 }
Tim Petersced69f82003-09-16 20:30:58 +00005032
Guido van Rossumd57fd912000-03-10 22:53:23 +00005033 e = p + PyUnicode_GET_SIZE(self);
5034 previous_is_cased = 0;
5035 for (; p < e; p++) {
5036 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005037
Guido van Rossumd57fd912000-03-10 22:53:23 +00005038 if (previous_is_cased)
5039 *p = Py_UNICODE_TOLOWER(ch);
5040 else
5041 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005042
5043 if (Py_UNICODE_ISLOWER(ch) ||
5044 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005045 Py_UNICODE_ISTITLE(ch))
5046 previous_is_cased = 1;
5047 else
5048 previous_is_cased = 0;
5049 }
5050 return 1;
5051}
5052
Tim Peters8ce9f162004-08-27 01:49:32 +00005053PyObject *
5054PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005055{
Tim Peters8ce9f162004-08-27 01:49:32 +00005056 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005057 const Py_UNICODE blank = ' ';
5058 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005059 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005060 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005061 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5062 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005063 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5064 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005065 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005066 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005067 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005068
Tim Peters05eba1f2004-08-27 21:32:02 +00005069 fseq = PySequence_Fast(seq, "");
5070 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005071 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005072 }
5073
Tim Peters91879ab2004-08-27 22:35:44 +00005074 /* Grrrr. A codec may be invoked to convert str objects to
5075 * Unicode, and so it's possible to call back into Python code
5076 * during PyUnicode_FromObject(), and so it's possible for a sick
5077 * codec to change the size of fseq (if seq is a list). Therefore
5078 * we have to keep refetching the size -- can't assume seqlen
5079 * is invariant.
5080 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005081 seqlen = PySequence_Fast_GET_SIZE(fseq);
5082 /* If empty sequence, return u"". */
5083 if (seqlen == 0) {
5084 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5085 goto Done;
5086 }
5087 /* If singleton sequence with an exact Unicode, return that. */
5088 if (seqlen == 1) {
5089 item = PySequence_Fast_GET_ITEM(fseq, 0);
5090 if (PyUnicode_CheckExact(item)) {
5091 Py_INCREF(item);
5092 res = (PyUnicodeObject *)item;
5093 goto Done;
5094 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005095 }
5096
Tim Peters05eba1f2004-08-27 21:32:02 +00005097 /* At least two items to join, or one that isn't exact Unicode. */
5098 if (seqlen > 1) {
5099 /* Set up sep and seplen -- they're needed. */
5100 if (separator == NULL) {
5101 sep = &blank;
5102 seplen = 1;
5103 }
5104 else {
5105 internal_separator = PyUnicode_FromObject(separator);
5106 if (internal_separator == NULL)
5107 goto onError;
5108 sep = PyUnicode_AS_UNICODE(internal_separator);
5109 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005110 /* In case PyUnicode_FromObject() mutated seq. */
5111 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005112 }
5113 }
5114
5115 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005116 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005117 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005118 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005119 res_p = PyUnicode_AS_UNICODE(res);
5120 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005121
Tim Peters05eba1f2004-08-27 21:32:02 +00005122 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005123 Py_ssize_t itemlen;
5124 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005125
5126 item = PySequence_Fast_GET_ITEM(fseq, i);
5127 /* Convert item to Unicode. */
5128 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5129 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005130 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005131 " %.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00005132 i, Py_Type(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005133 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005134 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005135 item = PyUnicode_FromObject(item);
5136 if (item == NULL)
5137 goto onError;
5138 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005139
Tim Peters91879ab2004-08-27 22:35:44 +00005140 /* In case PyUnicode_FromObject() mutated seq. */
5141 seqlen = PySequence_Fast_GET_SIZE(fseq);
5142
Tim Peters8ce9f162004-08-27 01:49:32 +00005143 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005144 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005145 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005146 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005147 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005148 if (i < seqlen - 1) {
5149 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005150 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005151 goto Overflow;
5152 }
5153 if (new_res_used > res_alloc) {
5154 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005155 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005156 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005157 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005158 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005159 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005160 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005161 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005163 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005164 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005165 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005166
5167 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005168 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005169 res_p += itemlen;
5170 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005171 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005172 res_p += seplen;
5173 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005175 res_used = new_res_used;
5176 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005177
Tim Peters05eba1f2004-08-27 21:32:02 +00005178 /* Shrink res to match the used area; this probably can't fail,
5179 * but it's cheap to check.
5180 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005181 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005182 goto onError;
5183
5184 Done:
5185 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005186 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005187 return (PyObject *)res;
5188
Tim Peters8ce9f162004-08-27 01:49:32 +00005189 Overflow:
5190 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005191 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005192 Py_DECREF(item);
5193 /* fall through */
5194
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005196 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005197 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005198 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199 return NULL;
5200}
5201
Tim Petersced69f82003-09-16 20:30:58 +00005202static
5203PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005204 Py_ssize_t left,
5205 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206 Py_UNICODE fill)
5207{
5208 PyUnicodeObject *u;
5209
5210 if (left < 0)
5211 left = 0;
5212 if (right < 0)
5213 right = 0;
5214
Tim Peters7a29bd52001-09-12 03:03:31 +00005215 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216 Py_INCREF(self);
5217 return self;
5218 }
5219
5220 u = _PyUnicode_New(left + self->length + right);
5221 if (u) {
5222 if (left)
5223 Py_UNICODE_FILL(u->str, fill, left);
5224 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5225 if (right)
5226 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5227 }
5228
5229 return u;
5230}
5231
/* Append data[left:right] to the list being built.

   Relies on names from the expanding function: a PyObject *str scratch
   variable, the PyObject *list being filled and an onError label that is
   jumped to when creating or appending the piece fails.  The expansion is
   a sequence of statements, so use it only where a statement list is
   valid. */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (str == NULL)                                                    \
        goto onError;                                                   \
    if (PyList_Append(list, str)) {                                     \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    Py_DECREF(str);
5242
5243static
5244PyObject *split_whitespace(PyUnicodeObject *self,
5245 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005246 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005247{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005248 register Py_ssize_t i;
5249 register Py_ssize_t j;
5250 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005251 PyObject *str;
5252
5253 for (i = j = 0; i < len; ) {
5254 /* find a token */
5255 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5256 i++;
5257 j = i;
5258 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5259 i++;
5260 if (j < i) {
5261 if (maxcount-- <= 0)
5262 break;
5263 SPLIT_APPEND(self->str, j, i);
5264 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5265 i++;
5266 j = i;
5267 }
5268 }
5269 if (j < len) {
5270 SPLIT_APPEND(self->str, j, len);
5271 }
5272 return list;
5273
5274 onError:
5275 Py_DECREF(list);
5276 return NULL;
5277}
5278
5279PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005280 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005281{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005282 register Py_ssize_t i;
5283 register Py_ssize_t j;
5284 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285 PyObject *list;
5286 PyObject *str;
5287 Py_UNICODE *data;
5288
5289 string = PyUnicode_FromObject(string);
5290 if (string == NULL)
5291 return NULL;
5292 data = PyUnicode_AS_UNICODE(string);
5293 len = PyUnicode_GET_SIZE(string);
5294
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295 list = PyList_New(0);
5296 if (!list)
5297 goto onError;
5298
5299 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005300 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005301
Guido van Rossumd57fd912000-03-10 22:53:23 +00005302 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005303 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005305
5306 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005307 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005308 if (i < len) {
5309 if (data[i] == '\r' && i + 1 < len &&
5310 data[i+1] == '\n')
5311 i += 2;
5312 else
5313 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005314 if (keepends)
5315 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316 }
Guido van Rossum86662912000-04-11 15:38:46 +00005317 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005318 j = i;
5319 }
5320 if (j < len) {
5321 SPLIT_APPEND(data, j, len);
5322 }
5323
5324 Py_DECREF(string);
5325 return list;
5326
5327 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005328 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005329 Py_DECREF(string);
5330 return NULL;
5331}
5332
Tim Petersced69f82003-09-16 20:30:58 +00005333static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005334PyObject *split_char(PyUnicodeObject *self,
5335 PyObject *list,
5336 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005337 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005339 register Py_ssize_t i;
5340 register Py_ssize_t j;
5341 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342 PyObject *str;
5343
5344 for (i = j = 0; i < len; ) {
5345 if (self->str[i] == ch) {
5346 if (maxcount-- <= 0)
5347 break;
5348 SPLIT_APPEND(self->str, j, i);
5349 i = j = i + 1;
5350 } else
5351 i++;
5352 }
5353 if (j <= len) {
5354 SPLIT_APPEND(self->str, j, len);
5355 }
5356 return list;
5357
5358 onError:
5359 Py_DECREF(list);
5360 return NULL;
5361}
5362
Tim Petersced69f82003-09-16 20:30:58 +00005363static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005364PyObject *split_substring(PyUnicodeObject *self,
5365 PyObject *list,
5366 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005367 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005369 register Py_ssize_t i;
5370 register Py_ssize_t j;
5371 Py_ssize_t len = self->length;
5372 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005373 PyObject *str;
5374
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005375 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376 if (Py_UNICODE_MATCH(self, i, substring)) {
5377 if (maxcount-- <= 0)
5378 break;
5379 SPLIT_APPEND(self->str, j, i);
5380 i = j = i + sublen;
5381 } else
5382 i++;
5383 }
5384 if (j <= len) {
5385 SPLIT_APPEND(self->str, j, len);
5386 }
5387 return list;
5388
5389 onError:
5390 Py_DECREF(list);
5391 return NULL;
5392}
5393
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005394static
5395PyObject *rsplit_whitespace(PyUnicodeObject *self,
5396 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005397 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005398{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005399 register Py_ssize_t i;
5400 register Py_ssize_t j;
5401 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005402 PyObject *str;
5403
5404 for (i = j = len - 1; i >= 0; ) {
5405 /* find a token */
5406 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5407 i--;
5408 j = i;
5409 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5410 i--;
5411 if (j > i) {
5412 if (maxcount-- <= 0)
5413 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005414 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005415 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5416 i--;
5417 j = i;
5418 }
5419 }
5420 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005421 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005422 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005423 if (PyList_Reverse(list) < 0)
5424 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005425 return list;
5426
5427 onError:
5428 Py_DECREF(list);
5429 return NULL;
5430}
5431
5432static
5433PyObject *rsplit_char(PyUnicodeObject *self,
5434 PyObject *list,
5435 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005436 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005437{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005438 register Py_ssize_t i;
5439 register Py_ssize_t j;
5440 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005441 PyObject *str;
5442
5443 for (i = j = len - 1; i >= 0; ) {
5444 if (self->str[i] == ch) {
5445 if (maxcount-- <= 0)
5446 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005447 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005448 j = i = i - 1;
5449 } else
5450 i--;
5451 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005452 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005453 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005454 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005455 if (PyList_Reverse(list) < 0)
5456 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005457 return list;
5458
5459 onError:
5460 Py_DECREF(list);
5461 return NULL;
5462}
5463
5464static
5465PyObject *rsplit_substring(PyUnicodeObject *self,
5466 PyObject *list,
5467 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005468 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005469{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005470 register Py_ssize_t i;
5471 register Py_ssize_t j;
5472 Py_ssize_t len = self->length;
5473 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005474 PyObject *str;
5475
5476 for (i = len - sublen, j = len; i >= 0; ) {
5477 if (Py_UNICODE_MATCH(self, i, substring)) {
5478 if (maxcount-- <= 0)
5479 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005480 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005481 j = i;
5482 i -= sublen;
5483 } else
5484 i--;
5485 }
5486 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005487 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005488 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005489 if (PyList_Reverse(list) < 0)
5490 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005491 return list;
5492
5493 onError:
5494 Py_DECREF(list);
5495 return NULL;
5496}
5497
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498#undef SPLIT_APPEND
5499
5500static
5501PyObject *split(PyUnicodeObject *self,
5502 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005503 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005504{
5505 PyObject *list;
5506
5507 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005508 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509
5510 list = PyList_New(0);
5511 if (!list)
5512 return NULL;
5513
5514 if (substring == NULL)
5515 return split_whitespace(self,list,maxcount);
5516
5517 else if (substring->length == 1)
5518 return split_char(self,list,substring->str[0],maxcount);
5519
5520 else if (substring->length == 0) {
5521 Py_DECREF(list);
5522 PyErr_SetString(PyExc_ValueError, "empty separator");
5523 return NULL;
5524 }
5525 else
5526 return split_substring(self,list,substring,maxcount);
5527}
5528
Tim Petersced69f82003-09-16 20:30:58 +00005529static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005530PyObject *rsplit(PyUnicodeObject *self,
5531 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005532 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005533{
5534 PyObject *list;
5535
5536 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005537 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005538
5539 list = PyList_New(0);
5540 if (!list)
5541 return NULL;
5542
5543 if (substring == NULL)
5544 return rsplit_whitespace(self,list,maxcount);
5545
5546 else if (substring->length == 1)
5547 return rsplit_char(self,list,substring->str[0],maxcount);
5548
5549 else if (substring->length == 0) {
5550 Py_DECREF(list);
5551 PyErr_SetString(PyExc_ValueError, "empty separator");
5552 return NULL;
5553 }
5554 else
5555 return rsplit_substring(self,list,substring,maxcount);
5556}
5557
5558static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559PyObject *replace(PyUnicodeObject *self,
5560 PyUnicodeObject *str1,
5561 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005562 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563{
5564 PyUnicodeObject *u;
5565
5566 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005567 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568
Thomas Wouters477c8d52006-05-27 19:21:47 +00005569 if (str1->length == str2->length) {
5570 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005571 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005572 if (str1->length == 1) {
5573 /* replace characters */
5574 Py_UNICODE u1, u2;
5575 if (!findchar(self->str, self->length, str1->str[0]))
5576 goto nothing;
5577 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5578 if (!u)
5579 return NULL;
5580 Py_UNICODE_COPY(u->str, self->str, self->length);
5581 u1 = str1->str[0];
5582 u2 = str2->str[0];
5583 for (i = 0; i < u->length; i++)
5584 if (u->str[i] == u1) {
5585 if (--maxcount < 0)
5586 break;
5587 u->str[i] = u2;
5588 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005590 i = fastsearch(
5591 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005593 if (i < 0)
5594 goto nothing;
5595 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5596 if (!u)
5597 return NULL;
5598 Py_UNICODE_COPY(u->str, self->str, self->length);
5599 while (i <= self->length - str1->length)
5600 if (Py_UNICODE_MATCH(self, i, str1)) {
5601 if (--maxcount < 0)
5602 break;
5603 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5604 i += str1->length;
5605 } else
5606 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005608 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005609
5610 Py_ssize_t n, i, j, e;
5611 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612 Py_UNICODE *p;
5613
5614 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005615 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616 if (n > maxcount)
5617 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005618 if (n == 0)
5619 goto nothing;
5620 /* new_size = self->length + n * (str2->length - str1->length)); */
5621 delta = (str2->length - str1->length);
5622 if (delta == 0) {
5623 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005625 product = n * (str2->length - str1->length);
5626 if ((product / (str2->length - str1->length)) != n) {
5627 PyErr_SetString(PyExc_OverflowError,
5628 "replace string is too long");
5629 return NULL;
5630 }
5631 new_size = self->length + product;
5632 if (new_size < 0) {
5633 PyErr_SetString(PyExc_OverflowError,
5634 "replace string is too long");
5635 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636 }
5637 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005638 u = _PyUnicode_New(new_size);
5639 if (!u)
5640 return NULL;
5641 i = 0;
5642 p = u->str;
5643 e = self->length - str1->length;
5644 if (str1->length > 0) {
5645 while (n-- > 0) {
5646 /* look for next match */
5647 j = i;
5648 while (j <= e) {
5649 if (Py_UNICODE_MATCH(self, j, str1))
5650 break;
5651 j++;
5652 }
5653 if (j > i) {
5654 if (j > e)
5655 break;
5656 /* copy unchanged part [i:j] */
5657 Py_UNICODE_COPY(p, self->str+i, j-i);
5658 p += j - i;
5659 }
5660 /* copy substitution string */
5661 if (str2->length > 0) {
5662 Py_UNICODE_COPY(p, str2->str, str2->length);
5663 p += str2->length;
5664 }
5665 i = j + str1->length;
5666 }
5667 if (i < self->length)
5668 /* copy tail [i:] */
5669 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5670 } else {
5671 /* interleave */
5672 while (n > 0) {
5673 Py_UNICODE_COPY(p, str2->str, str2->length);
5674 p += str2->length;
5675 if (--n <= 0)
5676 break;
5677 *p++ = self->str[i++];
5678 }
5679 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5680 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005683
5684nothing:
5685 /* nothing to replace; return original string (when possible) */
5686 if (PyUnicode_CheckExact(self)) {
5687 Py_INCREF(self);
5688 return (PyObject *) self;
5689 }
5690 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691}
5692
5693/* --- Unicode Object Methods --------------------------------------------- */
5694
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005695PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696"S.title() -> unicode\n\
5697\n\
5698Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005699characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700
5701static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005702unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704 return fixup(self, fixtitle);
5705}
5706
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005707PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708"S.capitalize() -> unicode\n\
5709\n\
5710Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005711have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712
5713static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005714unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716 return fixup(self, fixcapitalize);
5717}
5718
#if 0
/* Disabled code: kept compiled-out exactly as a reference implementation. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split S on whitespace, capitalize every word via fixup(), and join
   the words again with PyUnicode_Join(NULL, ...).  Returns NULL if any
   step fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *plain = PyList_GET_ITEM(words, idx);
        PyObject *capped = fixup((PyUnicodeObject *)plain, fixcapitalize);

        if (capped == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(plain);
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
5756
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005757/* Argument converter. Coerces to a single unicode character */
5758
5759static int
5760convert_uc(PyObject *obj, void *addr)
5761{
5762 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5763 PyObject *uniobj;
5764 Py_UNICODE *unistr;
5765
5766 uniobj = PyUnicode_FromObject(obj);
5767 if (uniobj == NULL) {
5768 PyErr_SetString(PyExc_TypeError,
5769 "The fill character cannot be converted to Unicode");
5770 return 0;
5771 }
5772 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5773 PyErr_SetString(PyExc_TypeError,
5774 "The fill character must be exactly one character long");
5775 Py_DECREF(uniobj);
5776 return 0;
5777 }
5778 unistr = PyUnicode_AS_UNICODE(uniobj);
5779 *fillcharloc = unistr[0];
5780 Py_DECREF(uniobj);
5781 return 1;
5782}
5783
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005784PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005785"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005787Return S centered in a Unicode string of length width. Padding is\n\
5788done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789
5790static PyObject *
5791unicode_center(PyUnicodeObject *self, PyObject *args)
5792{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005793 Py_ssize_t marg, left;
5794 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005795 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796
Thomas Woutersde017742006-02-16 19:34:37 +00005797 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798 return NULL;
5799
Tim Peters7a29bd52001-09-12 03:03:31 +00005800 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005801 Py_INCREF(self);
5802 return (PyObject*) self;
5803 }
5804
5805 marg = width - self->length;
5806 left = marg / 2 + (marg & width & 1);
5807
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005808 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005809}
5810
Marc-André Lemburge5034372000-08-08 08:04:29 +00005811#if 0
5812
5813/* This code should go into some future Unicode collation support
5814 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005815 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005816
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005817/* speedy UTF-16 code point order comparison */
5818/* gleaned from: */
5819/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5820
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005821static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005822{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005823 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005824 0, 0, 0, 0, 0, 0, 0, 0,
5825 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005826 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005827};
5828
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829static int
5830unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5831{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005832 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005833
Guido van Rossumd57fd912000-03-10 22:53:23 +00005834 Py_UNICODE *s1 = str1->str;
5835 Py_UNICODE *s2 = str2->str;
5836
5837 len1 = str1->length;
5838 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005839
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005841 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005842
5843 c1 = *s1++;
5844 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005845
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005846 if (c1 > (1<<11) * 26)
5847 c1 += utf16Fixup[c1>>11];
5848 if (c2 > (1<<11) * 26)
5849 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005850 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005851
5852 if (c1 != c2)
5853 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005854
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005855 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856 }
5857
5858 return (len1 < len2) ? -1 : (len1 != len2);
5859}
5860
Marc-André Lemburge5034372000-08-08 08:04:29 +00005861#else
5862
5863static int
5864unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5865{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005866 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005867
5868 Py_UNICODE *s1 = str1->str;
5869 Py_UNICODE *s2 = str2->str;
5870
5871 len1 = str1->length;
5872 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005873
Marc-André Lemburge5034372000-08-08 08:04:29 +00005874 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005875 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005876
Fredrik Lundh45714e92001-06-26 16:39:36 +00005877 c1 = *s1++;
5878 c2 = *s2++;
5879
5880 if (c1 != c2)
5881 return (c1 < c2) ? -1 : 1;
5882
Marc-André Lemburge5034372000-08-08 08:04:29 +00005883 len1--; len2--;
5884 }
5885
5886 return (len1 < len2) ? -1 : (len1 != len2);
5887}
5888
5889#endif
5890
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891int PyUnicode_Compare(PyObject *left,
5892 PyObject *right)
5893{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005894 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5895 return unicode_compare((PyUnicodeObject *)left,
5896 (PyUnicodeObject *)right);
5897 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5898 (PyUnicode_Check(left) && PyString_Check(right))) {
5899 if (PyUnicode_Check(left))
5900 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5901 if (PyUnicode_Check(right))
5902 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5903 assert(PyString_Check(left));
5904 assert(PyString_Check(right));
5905 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005907 PyErr_Format(PyExc_TypeError,
5908 "Can't compare %.100s and %.100s",
5909 left->ob_type->tp_name,
5910 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911 return -1;
5912}
5913
Martin v. Löwis5b222132007-06-10 09:51:05 +00005914int
5915PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
5916{
5917 int i;
5918 Py_UNICODE *id;
5919 assert(PyUnicode_Check(uni));
5920 id = PyUnicode_AS_UNICODE(uni);
5921 /* Compare Unicode string and source character set string */
5922 for (i = 0; id[i] && str[i]; i++)
5923 if (id[i] != str[i])
5924 return ((int)id[i] < (int)str[i]) ? -1 : 1;
5925 if (id[i])
5926 return 1; /* uni is longer */
5927 if (str[i])
5928 return -1; /* str is longer */
5929 return 0;
5930}
5931
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005932PyObject *PyUnicode_RichCompare(PyObject *left,
5933 PyObject *right,
5934 int op)
5935{
5936 int result;
5937
5938 result = PyUnicode_Compare(left, right);
5939 if (result == -1 && PyErr_Occurred())
5940 goto onError;
5941
5942 /* Convert the return value to a Boolean */
5943 switch (op) {
5944 case Py_EQ:
5945 result = (result == 0);
5946 break;
5947 case Py_NE:
5948 result = (result != 0);
5949 break;
5950 case Py_LE:
5951 result = (result <= 0);
5952 break;
5953 case Py_GE:
5954 result = (result >= 0);
5955 break;
5956 case Py_LT:
5957 result = (result == -1);
5958 break;
5959 case Py_GT:
5960 result = (result == 1);
5961 break;
5962 }
5963 return PyBool_FromLong(result);
5964
5965 onError:
5966
5967 /* Standard case
5968
5969 Type errors mean that PyUnicode_FromObject() could not convert
5970 one of the arguments (usually the right hand side) to Unicode,
5971 ie. we can't handle the comparison request. However, it is
5972 possible that the other object knows a comparison method, which
5973 is why we return Py_NotImplemented to give the other object a
5974 chance.
5975
5976 */
5977 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5978 PyErr_Clear();
5979 Py_INCREF(Py_NotImplemented);
5980 return Py_NotImplemented;
5981 }
5982 if (op != Py_EQ && op != Py_NE)
5983 return NULL;
5984
5985 /* Equality comparison.
5986
5987 This is a special case: we silence any PyExc_UnicodeDecodeError
5988 and instead turn it into a PyErr_UnicodeWarning.
5989
5990 */
5991 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5992 return NULL;
5993 PyErr_Clear();
5994 if (PyErr_Warn(PyExc_UnicodeWarning,
5995 (op == Py_EQ) ?
5996 "Unicode equal comparison "
5997 "failed to convert both arguments to Unicode - "
5998 "interpreting them as being unequal" :
5999 "Unicode unequal comparison "
6000 "failed to convert both arguments to Unicode - "
6001 "interpreting them as being unequal"
6002 ) < 0)
6003 return NULL;
6004 result = (op == Py_NE);
6005 return PyBool_FromLong(result);
6006}
6007
Guido van Rossum403d68b2000-03-13 15:55:09 +00006008int PyUnicode_Contains(PyObject *container,
6009 PyObject *element)
6010{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006011 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006012 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006013
6014 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006015 sub = PyUnicode_FromObject(element);
6016 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006017 PyErr_Format(PyExc_TypeError,
6018 "'in <string>' requires string as left operand, not %s",
6019 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006020 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006021 }
6022
Thomas Wouters477c8d52006-05-27 19:21:47 +00006023 str = PyUnicode_FromObject(container);
6024 if (!str) {
6025 Py_DECREF(sub);
6026 return -1;
6027 }
6028
6029 result = stringlib_contains_obj(str, sub);
6030
6031 Py_DECREF(str);
6032 Py_DECREF(sub);
6033
Guido van Rossum403d68b2000-03-13 15:55:09 +00006034 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006035}
6036
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037/* Concat to string or Unicode object giving a new Unicode object. */
6038
6039PyObject *PyUnicode_Concat(PyObject *left,
6040 PyObject *right)
6041{
6042 PyUnicodeObject *u = NULL, *v = NULL, *w;
6043
Guido van Rossum84d79dd2007-04-13 02:23:57 +00006044 if (PyBytes_Check(left) || PyBytes_Check(right))
6045 return PyBytes_Concat(left, right);
6046
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047 /* Coerce the two arguments */
6048 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6049 if (u == NULL)
6050 goto onError;
6051 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6052 if (v == NULL)
6053 goto onError;
6054
6055 /* Shortcuts */
6056 if (v == unicode_empty) {
6057 Py_DECREF(v);
6058 return (PyObject *)u;
6059 }
6060 if (u == unicode_empty) {
6061 Py_DECREF(u);
6062 return (PyObject *)v;
6063 }
6064
6065 /* Concat the two Unicode strings */
6066 w = _PyUnicode_New(u->length + v->length);
6067 if (w == NULL)
6068 goto onError;
6069 Py_UNICODE_COPY(w->str, u->str, u->length);
6070 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6071
6072 Py_DECREF(u);
6073 Py_DECREF(v);
6074 return (PyObject *)w;
6075
6076onError:
6077 Py_XDECREF(u);
6078 Py_XDECREF(v);
6079 return NULL;
6080}
6081
Walter Dörwald1ab83302007-05-18 17:15:44 +00006082void
6083PyUnicode_Append(PyObject **pleft, PyObject *right)
6084{
6085 PyObject *new;
6086 if (*pleft == NULL)
6087 return;
6088 if (right == NULL || !PyUnicode_Check(*pleft)) {
6089 Py_DECREF(*pleft);
6090 *pleft = NULL;
6091 return;
6092 }
6093 new = PyUnicode_Concat(*pleft, right);
6094 Py_DECREF(*pleft);
6095 *pleft = new;
6096}
6097
6098void
6099PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6100{
6101 PyUnicode_Append(pleft, right);
6102 Py_XDECREF(right);
6103}
6104
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006105PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106"S.count(sub[, start[, end]]) -> int\n\
6107\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006108Return the number of non-overlapping occurrences of substring sub in\n\
6109Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006110interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111
6112static PyObject *
6113unicode_count(PyUnicodeObject *self, PyObject *args)
6114{
6115 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006116 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006117 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118 PyObject *result;
6119
Guido van Rossumb8872e62000-05-09 14:14:27 +00006120 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6121 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122 return NULL;
6123
6124 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006125 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126 if (substring == NULL)
6127 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006128
Thomas Wouters477c8d52006-05-27 19:21:47 +00006129 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130
Thomas Wouters477c8d52006-05-27 19:21:47 +00006131 result = PyInt_FromSsize_t(
6132 stringlib_count(self->str + start, end - start,
6133 substring->str, substring->length)
6134 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135
6136 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006137
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138 return result;
6139}
6140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006141PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006142"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006144Encodes S using the codec registered for encoding. encoding defaults\n\
6145to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006146handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006147a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6148'xmlcharrefreplace' as well as any other name registered with\n\
6149codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150
6151static PyObject *
6152unicode_encode(PyUnicodeObject *self, PyObject *args)
6153{
6154 char *encoding = NULL;
6155 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006156 PyObject *v;
6157
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6159 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006160 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006161 if (v == NULL)
6162 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006163 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00006164 if (PyString_Check(v)) {
6165 /* Old codec, turn it into bytes */
6166 PyObject *b = PyBytes_FromObject(v);
6167 Py_DECREF(v);
6168 return b;
6169 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006170 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006171 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006172 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006173 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006174 Py_DECREF(v);
6175 return NULL;
6176 }
6177 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006178
6179 onError:
6180 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006181}
6182
6183PyDoc_STRVAR(decode__doc__,
6184"S.decode([encoding[,errors]]) -> string or unicode\n\
6185\n\
6186Decodes S using the codec registered for encoding. encoding defaults\n\
6187to the default encoding. errors may be given to set a different error\n\
6188handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6189a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6190as well as any other name registerd with codecs.register_error that is\n\
6191able to handle UnicodeDecodeErrors.");
6192
6193static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006194unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006195{
6196 char *encoding = NULL;
6197 char *errors = NULL;
6198 PyObject *v;
6199
6200 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6201 return NULL;
6202 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006203 if (v == NULL)
6204 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006205 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6206 PyErr_Format(PyExc_TypeError,
6207 "decoder did not return a string/unicode object "
6208 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006209 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006210 Py_DECREF(v);
6211 return NULL;
6212 }
6213 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006214
6215 onError:
6216 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217}
6218
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006219PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220"S.expandtabs([tabsize]) -> unicode\n\
6221\n\
6222Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006223If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224
6225static PyObject*
6226unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6227{
6228 Py_UNICODE *e;
6229 Py_UNICODE *p;
6230 Py_UNICODE *q;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006231 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 PyUnicodeObject *u;
6233 int tabsize = 8;
6234
6235 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6236 return NULL;
6237
Thomas Wouters7e474022000-07-16 12:04:32 +00006238 /* First pass: determine size of output string */
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006239 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240 e = self->str + self->length;
6241 for (p = self->str; p < e; p++)
6242 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006243 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244 j += tabsize - (j % tabsize);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006245 if (old_j > j) {
6246 PyErr_SetString(PyExc_OverflowError,
6247 "new string is too long");
6248 return NULL;
6249 }
6250 old_j = j;
6251 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006252 }
6253 else {
6254 j++;
6255 if (*p == '\n' || *p == '\r') {
6256 i += j;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006257 old_j = j = 0;
6258 if (i < 0) {
6259 PyErr_SetString(PyExc_OverflowError,
6260 "new string is too long");
6261 return NULL;
6262 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263 }
6264 }
6265
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006266 if ((i + j) < 0) {
6267 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6268 return NULL;
6269 }
6270
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271 /* Second pass: create output string and fill it */
6272 u = _PyUnicode_New(i + j);
6273 if (!u)
6274 return NULL;
6275
6276 j = 0;
6277 q = u->str;
6278
6279 for (p = self->str; p < e; p++)
6280 if (*p == '\t') {
6281 if (tabsize > 0) {
6282 i = tabsize - (j % tabsize);
6283 j += i;
6284 while (i--)
6285 *q++ = ' ';
6286 }
6287 }
6288 else {
6289 j++;
6290 *q++ = *p;
6291 if (*p == '\n' || *p == '\r')
6292 j = 0;
6293 }
6294
6295 return (PyObject*) u;
6296}
6297
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006298PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299"S.find(sub [,start [,end]]) -> int\n\
6300\n\
6301Return the lowest index in S where substring sub is found,\n\
6302such that sub is contained within s[start,end]. Optional\n\
6303arguments start and end are interpreted as in slice notation.\n\
6304\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006305Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306
6307static PyObject *
6308unicode_find(PyUnicodeObject *self, PyObject *args)
6309{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006310 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006311 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006312 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006313 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314
Guido van Rossumb8872e62000-05-09 14:14:27 +00006315 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6316 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006317 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006318 substring = PyUnicode_FromObject(substring);
6319 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320 return NULL;
6321
Thomas Wouters477c8d52006-05-27 19:21:47 +00006322 result = stringlib_find_slice(
6323 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6324 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6325 start, end
6326 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327
6328 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006329
6330 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331}
6332
6333static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006334unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335{
6336 if (index < 0 || index >= self->length) {
6337 PyErr_SetString(PyExc_IndexError, "string index out of range");
6338 return NULL;
6339 }
6340
6341 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6342}
6343
6344static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006345unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006346{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006347 /* Since Unicode objects compare equal to their UTF-8 string
6348 counterparts, we hash the UTF-8 string. */
6349 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
6350 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351}
6352
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006353PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354"S.index(sub [,start [,end]]) -> int\n\
6355\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006356Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357
6358static PyObject *
6359unicode_index(PyUnicodeObject *self, PyObject *args)
6360{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006361 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006362 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006363 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006364 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006365
Guido van Rossumb8872e62000-05-09 14:14:27 +00006366 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6367 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006369 substring = PyUnicode_FromObject(substring);
6370 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371 return NULL;
6372
Thomas Wouters477c8d52006-05-27 19:21:47 +00006373 result = stringlib_find_slice(
6374 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6375 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6376 start, end
6377 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378
6379 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006380
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381 if (result < 0) {
6382 PyErr_SetString(PyExc_ValueError, "substring not found");
6383 return NULL;
6384 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006385
Martin v. Löwis18e16552006-02-15 17:27:45 +00006386 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387}
6388
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006389PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006390"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006392Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006393at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394
6395static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006396unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397{
6398 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6399 register const Py_UNICODE *e;
6400 int cased;
6401
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402 /* Shortcut for single character strings */
6403 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006404 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006406 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006407 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006408 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006409
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410 e = p + PyUnicode_GET_SIZE(self);
6411 cased = 0;
6412 for (; p < e; p++) {
6413 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006414
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006416 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417 else if (!cased && Py_UNICODE_ISLOWER(ch))
6418 cased = 1;
6419 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006420 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421}
6422
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006423PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006424"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006426Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006427at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428
6429static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006430unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431{
6432 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6433 register const Py_UNICODE *e;
6434 int cased;
6435
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436 /* Shortcut for single character strings */
6437 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006438 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006440 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006441 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006442 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006443
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444 e = p + PyUnicode_GET_SIZE(self);
6445 cased = 0;
6446 for (; p < e; p++) {
6447 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006448
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006450 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451 else if (!cased && Py_UNICODE_ISUPPER(ch))
6452 cased = 1;
6453 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006454 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455}
6456
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006457PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006458"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006460Return True if S is a titlecased string and there is at least one\n\
6461character in S, i.e. upper- and titlecase characters may only\n\
6462follow uncased characters and lowercase characters only cased ones.\n\
6463Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464
6465static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006466unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467{
6468 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6469 register const Py_UNICODE *e;
6470 int cased, previous_is_cased;
6471
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472 /* Shortcut for single character strings */
6473 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006474 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6475 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006477 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006478 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006479 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006480
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481 e = p + PyUnicode_GET_SIZE(self);
6482 cased = 0;
6483 previous_is_cased = 0;
6484 for (; p < e; p++) {
6485 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006486
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6488 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006489 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490 previous_is_cased = 1;
6491 cased = 1;
6492 }
6493 else if (Py_UNICODE_ISLOWER(ch)) {
6494 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006495 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496 previous_is_cased = 1;
6497 cased = 1;
6498 }
6499 else
6500 previous_is_cased = 0;
6501 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006502 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503}
6504
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006505PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006506"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006508Return True if all characters in S are whitespace\n\
6509and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510
6511static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006512unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513{
6514 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6515 register const Py_UNICODE *e;
6516
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517 /* Shortcut for single character strings */
6518 if (PyUnicode_GET_SIZE(self) == 1 &&
6519 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006520 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006522 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006523 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006524 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006525
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526 e = p + PyUnicode_GET_SIZE(self);
6527 for (; p < e; p++) {
6528 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006529 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006531 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532}
6533
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006534PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006535"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006536\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006537Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006538and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006539
6540static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006541unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006542{
6543 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6544 register const Py_UNICODE *e;
6545
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006546 /* Shortcut for single character strings */
6547 if (PyUnicode_GET_SIZE(self) == 1 &&
6548 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006549 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006550
6551 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006552 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006553 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006554
6555 e = p + PyUnicode_GET_SIZE(self);
6556 for (; p < e; p++) {
6557 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006558 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006559 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006560 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006561}
6562
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006563PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006564"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006565\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006566Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006567and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006568
6569static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006570unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006571{
6572 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6573 register const Py_UNICODE *e;
6574
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006575 /* Shortcut for single character strings */
6576 if (PyUnicode_GET_SIZE(self) == 1 &&
6577 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006578 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006579
6580 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006581 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006582 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006583
6584 e = p + PyUnicode_GET_SIZE(self);
6585 for (; p < e; p++) {
6586 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006587 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006588 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006589 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006590}
6591
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006592PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006593"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006595Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006596False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597
6598static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006599unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600{
6601 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6602 register const Py_UNICODE *e;
6603
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604 /* Shortcut for single character strings */
6605 if (PyUnicode_GET_SIZE(self) == 1 &&
6606 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006607 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006609 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006610 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006611 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006612
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613 e = p + PyUnicode_GET_SIZE(self);
6614 for (; p < e; p++) {
6615 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006616 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006618 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619}
6620
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006621PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006622"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006624Return True if all characters in S are digits\n\
6625and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626
6627static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006628unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629{
6630 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6631 register const Py_UNICODE *e;
6632
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 /* Shortcut for single character strings */
6634 if (PyUnicode_GET_SIZE(self) == 1 &&
6635 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006636 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006638 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006639 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006640 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006641
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642 e = p + PyUnicode_GET_SIZE(self);
6643 for (; p < e; p++) {
6644 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006645 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006647 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648}
6649
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006650PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006651"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006653Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006654False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655
6656static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006657unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658{
6659 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6660 register const Py_UNICODE *e;
6661
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662 /* Shortcut for single character strings */
6663 if (PyUnicode_GET_SIZE(self) == 1 &&
6664 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006665 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006667 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006668 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006669 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006670
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671 e = p + PyUnicode_GET_SIZE(self);
6672 for (; p < e; p++) {
6673 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006674 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006676 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677}
6678
PyDoc_STRVAR(join__doc__,
"S.join(sequence) -> unicode\n\
\n\
Return a string which is the concatenation of the strings in the\n\
sequence. The separator between elements is S.");

/* Method body for S.join(sequence): forwards to PyUnicode_Join() with
   `self` as the first argument and `data` as the second, and returns
   its result unchanged. */
static PyObject*
unicode_join(PyObject *self, PyObject *data)
{
    return PyUnicode_Join(self, data);
}
6690
/* Length accessor: returns the `length` field of the unicode object. */
static Py_ssize_t
unicode_length(PyUnicodeObject *self)
{
    return self->length;
}
6696
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006697PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006698"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699\n\
6700Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006701done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702
6703static PyObject *
6704unicode_ljust(PyUnicodeObject *self, PyObject *args)
6705{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006706 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006707 Py_UNICODE fillchar = ' ';
6708
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006709 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710 return NULL;
6711
Tim Peters7a29bd52001-09-12 03:03:31 +00006712 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713 Py_INCREF(self);
6714 return (PyObject*) self;
6715 }
6716
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006717 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718}
6719
PyDoc_STRVAR(lower__doc__,
"S.lower() -> unicode\n\
\n\
Return a copy of the string S converted to lowercase.");

/* Method body for S.lower(): delegates to fixup() with the fixlower
   callback and returns its result. */
static PyObject*
unicode_lower(PyUnicodeObject *self)
{
    return fixup(self, fixlower);
}
6730
/* Strip modes understood by the strip helpers below.  The values double
   as indexes into stripformat[], so they must stay 0, 1, 2. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* PyArg_ParseTuple format strings, one per strip mode. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string minus its three
   leading characters "|O:". */
#define STRIPNAME(i) (stripformat[i]+3)
6739
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006740/* externally visible for str.strip(unicode) */
6741PyObject *
6742_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6743{
6744 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006745 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006746 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006747 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6748 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006749
Thomas Wouters477c8d52006-05-27 19:21:47 +00006750 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6751
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006752 i = 0;
6753 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006754 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6755 i++;
6756 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006757 }
6758
6759 j = len;
6760 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006761 do {
6762 j--;
6763 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6764 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006765 }
6766
6767 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006768 Py_INCREF(self);
6769 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006770 }
6771 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006772 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006773}
6774
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775
6776static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006777do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006779 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006780 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006781
6782 i = 0;
6783 if (striptype != RIGHTSTRIP) {
6784 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6785 i++;
6786 }
6787 }
6788
6789 j = len;
6790 if (striptype != LEFTSTRIP) {
6791 do {
6792 j--;
6793 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6794 j++;
6795 }
6796
6797 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6798 Py_INCREF(self);
6799 return (PyObject*)self;
6800 }
6801 else
6802 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803}
6804
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006805
6806static PyObject *
6807do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6808{
6809 PyObject *sep = NULL;
6810
6811 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6812 return NULL;
6813
6814 if (sep != NULL && sep != Py_None) {
6815 if (PyUnicode_Check(sep))
6816 return _PyUnicode_XStrip(self, striptype, sep);
6817 else if (PyString_Check(sep)) {
6818 PyObject *res;
6819 sep = PyUnicode_FromObject(sep);
6820 if (sep==NULL)
6821 return NULL;
6822 res = _PyUnicode_XStrip(self, striptype, sep);
6823 Py_DECREF(sep);
6824 return res;
6825 }
6826 else {
6827 PyErr_Format(PyExc_TypeError,
6828 "%s arg must be None, unicode or str",
6829 STRIPNAME(striptype));
6830 return NULL;
6831 }
6832 }
6833
6834 return do_strip(self, striptype);
6835}
6836
6837
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006838PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006839"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006840\n\
6841Return a copy of the string S with leading and trailing\n\
6842whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006843If chars is given and not None, remove characters in chars instead.\n\
6844If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006845
6846static PyObject *
6847unicode_strip(PyUnicodeObject *self, PyObject *args)
6848{
6849 if (PyTuple_GET_SIZE(args) == 0)
6850 return do_strip(self, BOTHSTRIP); /* Common case */
6851 else
6852 return do_argstrip(self, BOTHSTRIP, args);
6853}
6854
6855
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006856PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006857"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006858\n\
6859Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006860If chars is given and not None, remove characters in chars instead.\n\
6861If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006862
6863static PyObject *
6864unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6865{
6866 if (PyTuple_GET_SIZE(args) == 0)
6867 return do_strip(self, LEFTSTRIP); /* Common case */
6868 else
6869 return do_argstrip(self, LEFTSTRIP, args);
6870}
6871
6872
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006873PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006874"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006875\n\
6876Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006877If chars is given and not None, remove characters in chars instead.\n\
6878If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006879
6880static PyObject *
6881unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6882{
6883 if (PyTuple_GET_SIZE(args) == 0)
6884 return do_strip(self, RIGHTSTRIP); /* Common case */
6885 else
6886 return do_argstrip(self, RIGHTSTRIP, args);
6887}
6888
6889
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006891unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892{
6893 PyUnicodeObject *u;
6894 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006895 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006896 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897
6898 if (len < 0)
6899 len = 0;
6900
Tim Peters7a29bd52001-09-12 03:03:31 +00006901 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902 /* no repeat, return original string */
6903 Py_INCREF(str);
6904 return (PyObject*) str;
6905 }
Tim Peters8f422462000-09-09 06:13:41 +00006906
6907 /* ensure # of chars needed doesn't overflow int and # of bytes
6908 * needed doesn't overflow size_t
6909 */
6910 nchars = len * str->length;
6911 if (len && nchars / len != str->length) {
6912 PyErr_SetString(PyExc_OverflowError,
6913 "repeated string is too long");
6914 return NULL;
6915 }
6916 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6917 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6918 PyErr_SetString(PyExc_OverflowError,
6919 "repeated string is too long");
6920 return NULL;
6921 }
6922 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923 if (!u)
6924 return NULL;
6925
6926 p = u->str;
6927
Thomas Wouters477c8d52006-05-27 19:21:47 +00006928 if (str->length == 1 && len > 0) {
6929 Py_UNICODE_FILL(p, str->str[0], len);
6930 } else {
6931 Py_ssize_t done = 0; /* number of characters copied this far */
6932 if (done < nchars) {
6933 Py_UNICODE_COPY(p, str->str, str->length);
6934 done = str->length;
6935 }
6936 while (done < nchars) {
6937 int n = (done <= nchars-done) ? done : nchars-done;
6938 Py_UNICODE_COPY(p+done, p, n);
6939 done += n;
6940 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941 }
6942
6943 return (PyObject*) u;
6944}
6945
6946PyObject *PyUnicode_Replace(PyObject *obj,
6947 PyObject *subobj,
6948 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006949 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950{
6951 PyObject *self;
6952 PyObject *str1;
6953 PyObject *str2;
6954 PyObject *result;
6955
6956 self = PyUnicode_FromObject(obj);
6957 if (self == NULL)
6958 return NULL;
6959 str1 = PyUnicode_FromObject(subobj);
6960 if (str1 == NULL) {
6961 Py_DECREF(self);
6962 return NULL;
6963 }
6964 str2 = PyUnicode_FromObject(replobj);
6965 if (str2 == NULL) {
6966 Py_DECREF(self);
6967 Py_DECREF(str1);
6968 return NULL;
6969 }
Tim Petersced69f82003-09-16 20:30:58 +00006970 result = replace((PyUnicodeObject *)self,
6971 (PyUnicodeObject *)str1,
6972 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973 maxcount);
6974 Py_DECREF(self);
6975 Py_DECREF(str1);
6976 Py_DECREF(str2);
6977 return result;
6978}
6979
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006980PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981"S.replace (old, new[, maxsplit]) -> unicode\n\
6982\n\
6983Return a copy of S with all occurrences of substring\n\
6984old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006985given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986
6987static PyObject*
6988unicode_replace(PyUnicodeObject *self, PyObject *args)
6989{
6990 PyUnicodeObject *str1;
6991 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006992 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993 PyObject *result;
6994
Martin v. Löwis18e16552006-02-15 17:27:45 +00006995 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996 return NULL;
6997 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6998 if (str1 == NULL)
6999 return NULL;
7000 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007001 if (str2 == NULL) {
7002 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007004 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005
7006 result = replace(self, str1, str2, maxcount);
7007
7008 Py_DECREF(str1);
7009 Py_DECREF(str2);
7010 return result;
7011}
7012
7013static
7014PyObject *unicode_repr(PyObject *unicode)
7015{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007016 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007017 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007018 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7019 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7020
7021 /* XXX(nnorwitz): rather than over-allocating, it would be
7022 better to choose a different scheme. Perhaps scan the
7023 first N-chars of the string and allocate based on that size.
7024 */
7025 /* Initial allocation is based on the longest-possible unichr
7026 escape.
7027
7028 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7029 unichr, so in this case it's the longest unichr escape. In
7030 narrow (UTF-16) builds this is five chars per source unichr
7031 since there are two unichrs in the surrogate pair, so in narrow
7032 (UTF-16) builds it's not the longest unichr escape.
7033
7034 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7035 so in the narrow (UTF-16) build case it's the longest unichr
7036 escape.
7037 */
7038
Walter Dörwald1ab83302007-05-18 17:15:44 +00007039 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007040 2 /* quotes */
7041#ifdef Py_UNICODE_WIDE
7042 + 10*size
7043#else
7044 + 6*size
7045#endif
7046 + 1);
7047 if (repr == NULL)
7048 return NULL;
7049
Walter Dörwald1ab83302007-05-18 17:15:44 +00007050 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007051
7052 /* Add quote */
7053 *p++ = (findchar(s, size, '\'') &&
7054 !findchar(s, size, '"')) ? '"' : '\'';
7055 while (size-- > 0) {
7056 Py_UNICODE ch = *s++;
7057
7058 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007059 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007060 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007061 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007062 continue;
7063 }
7064
7065#ifdef Py_UNICODE_WIDE
7066 /* Map 21-bit characters to '\U00xxxxxx' */
7067 else if (ch >= 0x10000) {
7068 *p++ = '\\';
7069 *p++ = 'U';
7070 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7071 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7072 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7073 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7074 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7075 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7076 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7077 *p++ = hexdigits[ch & 0x0000000F];
7078 continue;
7079 }
7080#else
7081 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7082 else if (ch >= 0xD800 && ch < 0xDC00) {
7083 Py_UNICODE ch2;
7084 Py_UCS4 ucs;
7085
7086 ch2 = *s++;
7087 size--;
7088 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7089 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7090 *p++ = '\\';
7091 *p++ = 'U';
7092 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7093 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7094 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7095 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7096 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7097 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7098 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7099 *p++ = hexdigits[ucs & 0x0000000F];
7100 continue;
7101 }
7102 /* Fall through: isolated surrogates are copied as-is */
7103 s--;
7104 size++;
7105 }
7106#endif
7107
7108 /* Map 16-bit characters to '\uxxxx' */
7109 if (ch >= 256) {
7110 *p++ = '\\';
7111 *p++ = 'u';
7112 *p++ = hexdigits[(ch >> 12) & 0x000F];
7113 *p++ = hexdigits[(ch >> 8) & 0x000F];
7114 *p++ = hexdigits[(ch >> 4) & 0x000F];
7115 *p++ = hexdigits[ch & 0x000F];
7116 }
7117
7118 /* Map special whitespace to '\t', \n', '\r' */
7119 else if (ch == '\t') {
7120 *p++ = '\\';
7121 *p++ = 't';
7122 }
7123 else if (ch == '\n') {
7124 *p++ = '\\';
7125 *p++ = 'n';
7126 }
7127 else if (ch == '\r') {
7128 *p++ = '\\';
7129 *p++ = 'r';
7130 }
7131
7132 /* Map non-printable US ASCII to '\xhh' */
7133 else if (ch < ' ' || ch >= 0x7F) {
7134 *p++ = '\\';
7135 *p++ = 'x';
7136 *p++ = hexdigits[(ch >> 4) & 0x000F];
7137 *p++ = hexdigits[ch & 0x000F];
7138 }
7139
7140 /* Copy everything else as-is */
7141 else
7142 *p++ = (char) ch;
7143 }
7144 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007145 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007146
7147 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007148 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007149 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150}
7151
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007152PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007153"S.rfind(sub [,start [,end]]) -> int\n\
7154\n\
7155Return the highest index in S where substring sub is found,\n\
7156such that sub is contained within s[start,end]. Optional\n\
7157arguments start and end are interpreted as in slice notation.\n\
7158\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007159Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160
7161static PyObject *
7162unicode_rfind(PyUnicodeObject *self, PyObject *args)
7163{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007164 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007165 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007166 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007167 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007168
Guido van Rossumb8872e62000-05-09 14:14:27 +00007169 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7170 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007172 substring = PyUnicode_FromObject(substring);
7173 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007174 return NULL;
7175
Thomas Wouters477c8d52006-05-27 19:21:47 +00007176 result = stringlib_rfind_slice(
7177 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7178 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7179 start, end
7180 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181
7182 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007183
7184 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185}
7186
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007187PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007188"S.rindex(sub [,start [,end]]) -> int\n\
7189\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007190Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191
7192static PyObject *
7193unicode_rindex(PyUnicodeObject *self, PyObject *args)
7194{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007195 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007196 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007197 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007198 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199
Guido van Rossumb8872e62000-05-09 14:14:27 +00007200 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7201 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007202 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007203 substring = PyUnicode_FromObject(substring);
7204 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205 return NULL;
7206
Thomas Wouters477c8d52006-05-27 19:21:47 +00007207 result = stringlib_rfind_slice(
7208 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7209 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7210 start, end
7211 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212
7213 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007214
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215 if (result < 0) {
7216 PyErr_SetString(PyExc_ValueError, "substring not found");
7217 return NULL;
7218 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007219 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007220}
7221
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007222PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007223"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224\n\
7225Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007226done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227
7228static PyObject *
7229unicode_rjust(PyUnicodeObject *self, PyObject *args)
7230{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007231 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007232 Py_UNICODE fillchar = ' ';
7233
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007234 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007235 return NULL;
7236
Tim Peters7a29bd52001-09-12 03:03:31 +00007237 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007238 Py_INCREF(self);
7239 return (PyObject*) self;
7240 }
7241
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007242 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243}
7244
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007246unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007247{
7248 /* standard clamping */
7249 if (start < 0)
7250 start = 0;
7251 if (end < 0)
7252 end = 0;
7253 if (end > self->length)
7254 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007255 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256 /* full slice, return original string */
7257 Py_INCREF(self);
7258 return (PyObject*) self;
7259 }
7260 if (start > end)
7261 start = end;
7262 /* copy slice */
7263 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7264 end - start);
7265}
7266
7267PyObject *PyUnicode_Split(PyObject *s,
7268 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007269 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270{
7271 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007272
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273 s = PyUnicode_FromObject(s);
7274 if (s == NULL)
7275 return NULL;
7276 if (sep != NULL) {
7277 sep = PyUnicode_FromObject(sep);
7278 if (sep == NULL) {
7279 Py_DECREF(s);
7280 return NULL;
7281 }
7282 }
7283
7284 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7285
7286 Py_DECREF(s);
7287 Py_XDECREF(sep);
7288 return result;
7289}
7290
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007291PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292"S.split([sep [,maxsplit]]) -> list of strings\n\
7293\n\
7294Return a list of the words in S, using sep as the\n\
7295delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007296splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007297any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007298
7299static PyObject*
7300unicode_split(PyUnicodeObject *self, PyObject *args)
7301{
7302 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007303 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007304
Martin v. Löwis18e16552006-02-15 17:27:45 +00007305 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306 return NULL;
7307
7308 if (substring == Py_None)
7309 return split(self, NULL, maxcount);
7310 else if (PyUnicode_Check(substring))
7311 return split(self, (PyUnicodeObject *)substring, maxcount);
7312 else
7313 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7314}
7315
Thomas Wouters477c8d52006-05-27 19:21:47 +00007316PyObject *
7317PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7318{
7319 PyObject* str_obj;
7320 PyObject* sep_obj;
7321 PyObject* out;
7322
7323 str_obj = PyUnicode_FromObject(str_in);
7324 if (!str_obj)
7325 return NULL;
7326 sep_obj = PyUnicode_FromObject(sep_in);
7327 if (!sep_obj) {
7328 Py_DECREF(str_obj);
7329 return NULL;
7330 }
7331
7332 out = stringlib_partition(
7333 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7334 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7335 );
7336
7337 Py_DECREF(sep_obj);
7338 Py_DECREF(str_obj);
7339
7340 return out;
7341}
7342
7343
7344PyObject *
7345PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7346{
7347 PyObject* str_obj;
7348 PyObject* sep_obj;
7349 PyObject* out;
7350
7351 str_obj = PyUnicode_FromObject(str_in);
7352 if (!str_obj)
7353 return NULL;
7354 sep_obj = PyUnicode_FromObject(sep_in);
7355 if (!sep_obj) {
7356 Py_DECREF(str_obj);
7357 return NULL;
7358 }
7359
7360 out = stringlib_rpartition(
7361 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7362 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7363 );
7364
7365 Py_DECREF(sep_obj);
7366 Py_DECREF(str_obj);
7367
7368 return out;
7369}
7370
7371PyDoc_STRVAR(partition__doc__,
7372"S.partition(sep) -> (head, sep, tail)\n\
7373\n\
7374Searches for the separator sep in S, and returns the part before it,\n\
7375the separator itself, and the part after it. If the separator is not\n\
7376found, returns S and two empty strings.");
7377
7378static PyObject*
7379unicode_partition(PyUnicodeObject *self, PyObject *separator)
7380{
7381 return PyUnicode_Partition((PyObject *)self, separator);
7382}
7383
7384PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007385"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007386\n\
7387Searches for the separator sep in S, starting at the end of S, and returns\n\
7388the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007389separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007390
7391static PyObject*
7392unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7393{
7394 return PyUnicode_RPartition((PyObject *)self, separator);
7395}
7396
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007397PyObject *PyUnicode_RSplit(PyObject *s,
7398 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007399 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007400{
7401 PyObject *result;
7402
7403 s = PyUnicode_FromObject(s);
7404 if (s == NULL)
7405 return NULL;
7406 if (sep != NULL) {
7407 sep = PyUnicode_FromObject(sep);
7408 if (sep == NULL) {
7409 Py_DECREF(s);
7410 return NULL;
7411 }
7412 }
7413
7414 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7415
7416 Py_DECREF(s);
7417 Py_XDECREF(sep);
7418 return result;
7419}
7420
7421PyDoc_STRVAR(rsplit__doc__,
7422"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7423\n\
7424Return a list of the words in S, using sep as the\n\
7425delimiter string, starting at the end of the string and\n\
7426working to the front. If maxsplit is given, at most maxsplit\n\
7427splits are done. If sep is not specified, any whitespace string\n\
7428is a separator.");
7429
7430static PyObject*
7431unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7432{
7433 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007434 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007435
Martin v. Löwis18e16552006-02-15 17:27:45 +00007436 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007437 return NULL;
7438
7439 if (substring == Py_None)
7440 return rsplit(self, NULL, maxcount);
7441 else if (PyUnicode_Check(substring))
7442 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7443 else
7444 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7445}
7446
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007447PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007448"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449\n\
7450Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007451Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007452is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453
7454static PyObject*
7455unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7456{
Guido van Rossum86662912000-04-11 15:38:46 +00007457 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007458
Guido van Rossum86662912000-04-11 15:38:46 +00007459 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460 return NULL;
7461
Guido van Rossum86662912000-04-11 15:38:46 +00007462 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463}
7464
7465static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007466PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467{
Walter Dörwald346737f2007-05-31 10:44:43 +00007468 if (PyUnicode_CheckExact(self)) {
7469 Py_INCREF(self);
7470 return self;
7471 } else
7472 /* Subtype -- return genuine unicode string with the same value. */
7473 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7474 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475}
7476
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007477PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007478"S.swapcase() -> unicode\n\
7479\n\
7480Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007481and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482
7483static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007484unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007486 return fixup(self, fixswapcase);
7487}
7488
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007489PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490"S.translate(table) -> unicode\n\
7491\n\
7492Return a copy of the string S, where all characters have been mapped\n\
7493through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007494Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7495Unmapped characters are left untouched. Characters mapped to None\n\
7496are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497
7498static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007499unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500{
Tim Petersced69f82003-09-16 20:30:58 +00007501 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007502 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007503 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504 "ignore");
7505}
7506
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007507PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508"S.upper() -> unicode\n\
7509\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007510Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007511
7512static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007513unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007514{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515 return fixup(self, fixupper);
7516}
7517
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007518PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007519"S.zfill(width) -> unicode\n\
7520\n\
7521Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007522of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007523
7524static PyObject *
7525unicode_zfill(PyUnicodeObject *self, PyObject *args)
7526{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007527 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528 PyUnicodeObject *u;
7529
Martin v. Löwis18e16552006-02-15 17:27:45 +00007530 Py_ssize_t width;
7531 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532 return NULL;
7533
7534 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007535 if (PyUnicode_CheckExact(self)) {
7536 Py_INCREF(self);
7537 return (PyObject*) self;
7538 }
7539 else
7540 return PyUnicode_FromUnicode(
7541 PyUnicode_AS_UNICODE(self),
7542 PyUnicode_GET_SIZE(self)
7543 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007544 }
7545
7546 fill = width - self->length;
7547
7548 u = pad(self, fill, 0, '0');
7549
Walter Dörwald068325e2002-04-15 13:36:47 +00007550 if (u == NULL)
7551 return NULL;
7552
Guido van Rossumd57fd912000-03-10 22:53:23 +00007553 if (u->str[fill] == '+' || u->str[fill] == '-') {
7554 /* move sign to beginning of string */
7555 u->str[0] = u->str[fill];
7556 u->str[fill] = '0';
7557 }
7558
7559 return (PyObject*) u;
7560}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561
#if 0
/* Disabled helper: would return unicode_freelist_size as an int object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = unicode_freelist_size;

    return PyInt_FromLong(count);
}
#endif
7569
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007570PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007571"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007573Return True if S starts with the specified prefix, False otherwise.\n\
7574With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007575With optional end, stop comparing S at that position.\n\
7576prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577
7578static PyObject *
7579unicode_startswith(PyUnicodeObject *self,
7580 PyObject *args)
7581{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007582 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007584 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007585 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007586 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007588 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007589 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007591 if (PyTuple_Check(subobj)) {
7592 Py_ssize_t i;
7593 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7594 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7595 PyTuple_GET_ITEM(subobj, i));
7596 if (substring == NULL)
7597 return NULL;
7598 result = tailmatch(self, substring, start, end, -1);
7599 Py_DECREF(substring);
7600 if (result) {
7601 Py_RETURN_TRUE;
7602 }
7603 }
7604 /* nothing matched */
7605 Py_RETURN_FALSE;
7606 }
7607 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007609 return NULL;
7610 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007611 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007612 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613}
7614
7615
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007616PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007617"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007618\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007619Return True if S ends with the specified suffix, False otherwise.\n\
7620With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007621With optional end, stop comparing S at that position.\n\
7622suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007623
7624static PyObject *
7625unicode_endswith(PyUnicodeObject *self,
7626 PyObject *args)
7627{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007628 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007629 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007630 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007631 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007632 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007633
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007634 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7635 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007636 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007637 if (PyTuple_Check(subobj)) {
7638 Py_ssize_t i;
7639 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7640 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7641 PyTuple_GET_ITEM(subobj, i));
7642 if (substring == NULL)
7643 return NULL;
7644 result = tailmatch(self, substring, start, end, +1);
7645 Py_DECREF(substring);
7646 if (result) {
7647 Py_RETURN_TRUE;
7648 }
7649 }
7650 Py_RETURN_FALSE;
7651 }
7652 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007653 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007654 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007656 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007658 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007659}
7660
7661
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007662
7663static PyObject *
7664unicode_getnewargs(PyUnicodeObject *v)
7665{
7666 return Py_BuildValue("(u#)", v->str, v->length);
7667}
7668
7669
Guido van Rossumd57fd912000-03-10 22:53:23 +00007670static PyMethodDef unicode_methods[] = {
7671
7672 /* Order is according to common usage: often used methods should
7673 appear first, since lookup is done sequentially. */
7674
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007675 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7676 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7677 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007678 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007679 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7680 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7681 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7682 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7683 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7684 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7685 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007686 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007687 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7688 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7689 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007690 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007691 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007692/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7693 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7694 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7695 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007696 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007697 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007698 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007699 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007700 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7701 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7702 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7703 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7704 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7705 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7706 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7707 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7708 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7709 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7710 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7711 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7712 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7713 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007714 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007715#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007716 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717#endif
7718
7719#if 0
7720 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007721 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722#endif
7723
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007724 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007725 {NULL, NULL}
7726};
7727
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007728static PyObject *
7729unicode_mod(PyObject *v, PyObject *w)
7730{
7731 if (!PyUnicode_Check(v)) {
7732 Py_INCREF(Py_NotImplemented);
7733 return Py_NotImplemented;
7734 }
7735 return PyUnicode_Format(v, w);
7736}
7737
7738static PyNumberMethods unicode_as_number = {
7739 0, /*nb_add*/
7740 0, /*nb_subtract*/
7741 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007742 unicode_mod, /*nb_remainder*/
7743};
7744
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007746 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007747 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007748 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7749 (ssizeargfunc) unicode_getitem, /* sq_item */
7750 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751 0, /* sq_ass_item */
7752 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007753 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754};
7755
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007756static PyObject*
7757unicode_subscript(PyUnicodeObject* self, PyObject* item)
7758{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007759 if (PyIndex_Check(item)) {
7760 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007761 if (i == -1 && PyErr_Occurred())
7762 return NULL;
7763 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007764 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007765 return unicode_getitem(self, i);
7766 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007767 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007768 Py_UNICODE* source_buf;
7769 Py_UNICODE* result_buf;
7770 PyObject* result;
7771
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007772 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007773 &start, &stop, &step, &slicelength) < 0) {
7774 return NULL;
7775 }
7776
7777 if (slicelength <= 0) {
7778 return PyUnicode_FromUnicode(NULL, 0);
7779 } else {
7780 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007781 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7782 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007783
7784 if (result_buf == NULL)
7785 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007786
7787 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7788 result_buf[i] = source_buf[cur];
7789 }
Tim Petersced69f82003-09-16 20:30:58 +00007790
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007791 result = PyUnicode_FromUnicode(result_buf, slicelength);
7792 PyMem_FREE(result_buf);
7793 return result;
7794 }
7795 } else {
7796 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7797 return NULL;
7798 }
7799}
7800
7801static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007802 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007803 (binaryfunc)unicode_subscript, /* mp_subscript */
7804 (objobjargproc)0, /* mp_ass_subscript */
7805};
7806
Martin v. Löwis18e16552006-02-15 17:27:45 +00007807static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007808unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007809 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007810 const void **ptr)
7811{
7812 if (index != 0) {
7813 PyErr_SetString(PyExc_SystemError,
7814 "accessing non-existent unicode segment");
7815 return -1;
7816 }
7817 *ptr = (void *) self->str;
7818 return PyUnicode_GET_DATA_SIZE(self);
7819}
7820
Martin v. Löwis18e16552006-02-15 17:27:45 +00007821static Py_ssize_t
7822unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007823 const void **ptr)
7824{
7825 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007826 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007827 return -1;
7828}
7829
7830static int
7831unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007832 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007833{
7834 if (lenp)
7835 *lenp = PyUnicode_GET_DATA_SIZE(self);
7836 return 1;
7837}
7838
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007839static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007840unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007841 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007842 const void **ptr)
7843{
7844 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007845
Guido van Rossumd57fd912000-03-10 22:53:23 +00007846 if (index != 0) {
7847 PyErr_SetString(PyExc_SystemError,
7848 "accessing non-existent unicode segment");
7849 return -1;
7850 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007851 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007852 if (str == NULL)
7853 return -1;
7854 *ptr = (void *) PyString_AS_STRING(str);
7855 return PyString_GET_SIZE(str);
7856}
7857
7858/* Helpers for PyUnicode_Format() */
7859
7860static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007861getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007862{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007863 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007864 if (argidx < arglen) {
7865 (*p_argidx)++;
7866 if (arglen < 0)
7867 return args;
7868 else
7869 return PyTuple_GetItem(args, argidx);
7870 }
7871 PyErr_SetString(PyExc_TypeError,
7872 "not enough arguments for format string");
7873 return NULL;
7874}
7875
/* Flag bits accumulated while parsing a format specifier; each one is set
   by the flag character noted next to it. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
7881
Martin v. Löwis18e16552006-02-15 17:27:45 +00007882static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007883strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007885 register Py_ssize_t i;
7886 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007887 for (i = len - 1; i >= 0; i--)
7888 buffer[i] = (Py_UNICODE) charbuffer[i];
7889
Guido van Rossumd57fd912000-03-10 22:53:23 +00007890 return len;
7891}
7892
Neal Norwitzfc76d632006-01-10 06:03:13 +00007893static int
7894doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7895{
Tim Peters15231542006-02-16 01:08:01 +00007896 Py_ssize_t result;
7897
Neal Norwitzfc76d632006-01-10 06:03:13 +00007898 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007899 result = strtounicode(buffer, (char *)buffer);
7900 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007901}
7902
7903static int
7904longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7905{
Tim Peters15231542006-02-16 01:08:01 +00007906 Py_ssize_t result;
7907
Neal Norwitzfc76d632006-01-10 06:03:13 +00007908 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007909 result = strtounicode(buffer, (char *)buffer);
7910 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007911}
7912
Guido van Rossum078151d2002-08-11 04:24:12 +00007913/* XXX To save some code duplication, formatfloat/long/int could have been
7914 shared with stringobject.c, converting from 8-bit to Unicode after the
7915 formatting is done. */
7916
Guido van Rossumd57fd912000-03-10 22:53:23 +00007917static int
7918formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007919 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007920 int flags,
7921 int prec,
7922 int type,
7923 PyObject *v)
7924{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007925 /* fmt = '%#.' + `prec` + `type`
7926 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007927 char fmt[20];
7928 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007929
Guido van Rossumd57fd912000-03-10 22:53:23 +00007930 x = PyFloat_AsDouble(v);
7931 if (x == -1.0 && PyErr_Occurred())
7932 return -1;
7933 if (prec < 0)
7934 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007935 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7936 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007937 /* Worst case length calc to ensure no buffer overrun:
7938
7939 'g' formats:
7940 fmt = %#.<prec>g
7941 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7942 for any double rep.)
7943 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7944
7945 'f' formats:
7946 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7947 len = 1 + 50 + 1 + prec = 52 + prec
7948
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007949 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007950 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007951
7952 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00007953 if (((type == 'g' || type == 'G') &&
7954 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007955 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007956 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007957 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007958 return -1;
7959 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007960 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7961 (flags&F_ALT) ? "#" : "",
7962 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007963 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007964}
7965
Tim Peters38fd5b62000-09-21 05:43:11 +00007966static PyObject*
7967formatlong(PyObject *val, int flags, int prec, int type)
7968{
7969 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00007970 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00007971 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00007972 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00007973
7974 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7975 if (!str)
7976 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00007977 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00007978 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00007979 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00007980}
7981
Guido van Rossumd57fd912000-03-10 22:53:23 +00007982static int
7983formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007984 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985 int flags,
7986 int prec,
7987 int type,
7988 PyObject *v)
7989{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007990 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007991 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7992 * + 1 + 1
7993 * = 24
7994 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007995 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007996 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007997 long x;
7998
7999 x = PyInt_AsLong(v);
8000 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008001 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008002 if (x < 0 && type == 'u') {
8003 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008004 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008005 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8006 sign = "-";
8007 else
8008 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008010 prec = 1;
8011
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008012 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8013 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008014 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008015 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008016 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008017 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008018 return -1;
8019 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008020
8021 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008022 (type == 'x' || type == 'X' || type == 'o')) {
8023 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008024 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008025 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008026 * - when 0 is being converted, the C standard leaves off
8027 * the '0x' or '0X', which is inconsistent with other
8028 * %#x/%#X conversions and inconsistent with Python's
8029 * hex() function
8030 * - there are platforms that violate the standard and
8031 * convert 0 with the '0x' or '0X'
8032 * (Metrowerks, Compaq Tru64)
8033 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008034 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008035 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008036 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008037 * We can achieve the desired consistency by inserting our
8038 * own '0x' or '0X' prefix, and substituting %x/%X in place
8039 * of %#x/%#X.
8040 *
8041 * Note that this is the same approach as used in
8042 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008043 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008044 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8045 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008046 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008047 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008048 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8049 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008050 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008051 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008052 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008053 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008054 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008055 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008056}
8057
8058static int
8059formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008060 size_t buflen,
8061 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008062{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008063 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008064 if (PyUnicode_Check(v)) {
8065 if (PyUnicode_GET_SIZE(v) != 1)
8066 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008067 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008068 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008069
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008070 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008071 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008072 goto onError;
8073 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8074 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008075
8076 else {
8077 /* Integer input truncated to a character */
8078 long x;
8079 x = PyInt_AsLong(v);
8080 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008081 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008082#ifdef Py_UNICODE_WIDE
8083 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008084 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008085 "%c arg not in range(0x110000) "
8086 "(wide Python build)");
8087 return -1;
8088 }
8089#else
8090 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008091 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008092 "%c arg not in range(0x10000) "
8093 "(narrow Python build)");
8094 return -1;
8095 }
8096#endif
8097 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008098 }
8099 buf[1] = '\0';
8100 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008101
8102 onError:
8103 PyErr_SetString(PyExc_TypeError,
8104 "%c requires int or char");
8105 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008106}
8107
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008108/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8109
8110 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8111 chars are formatted. XXX This is a magic number. Each formatting
8112 routine does bounds checking to ensure no overflow, but a better
8113 solution may be to malloc a buffer of appropriate size for each
8114 format. For now, the current solution is sufficient.
8115*/
/* Parenthesized so the cast expression stays a single operand wherever
   the macro is expanded (e.g. after a bare sizeof). */
#define FORMATBUFLEN ((size_t)120)
8117
Guido van Rossumd57fd912000-03-10 22:53:23 +00008118PyObject *PyUnicode_Format(PyObject *format,
8119 PyObject *args)
8120{
8121 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008122 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123 int args_owned = 0;
8124 PyUnicodeObject *result = NULL;
8125 PyObject *dict = NULL;
8126 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008127
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128 if (format == NULL || args == NULL) {
8129 PyErr_BadInternalCall();
8130 return NULL;
8131 }
8132 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008133 if (uformat == NULL)
8134 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008135 fmt = PyUnicode_AS_UNICODE(uformat);
8136 fmtcnt = PyUnicode_GET_SIZE(uformat);
8137
8138 reslen = rescnt = fmtcnt + 100;
8139 result = _PyUnicode_New(reslen);
8140 if (result == NULL)
8141 goto onError;
8142 res = PyUnicode_AS_UNICODE(result);
8143
8144 if (PyTuple_Check(args)) {
8145 arglen = PyTuple_Size(args);
8146 argidx = 0;
8147 }
8148 else {
8149 arglen = -1;
8150 argidx = -2;
8151 }
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008152 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008153 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008154 dict = args;
8155
8156 while (--fmtcnt >= 0) {
8157 if (*fmt != '%') {
8158 if (--rescnt < 0) {
8159 rescnt = fmtcnt + 100;
8160 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008161 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008162 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008163 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8164 --rescnt;
8165 }
8166 *res++ = *fmt++;
8167 }
8168 else {
8169 /* Got a format specifier */
8170 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008171 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008172 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008173 Py_UNICODE c = '\0';
8174 Py_UNICODE fill;
8175 PyObject *v = NULL;
8176 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008177 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008178 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008179 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008180 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008181
8182 fmt++;
8183 if (*fmt == '(') {
8184 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008185 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008186 PyObject *key;
8187 int pcount = 1;
8188
8189 if (dict == NULL) {
8190 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008191 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008192 goto onError;
8193 }
8194 ++fmt;
8195 --fmtcnt;
8196 keystart = fmt;
8197 /* Skip over balanced parentheses */
8198 while (pcount > 0 && --fmtcnt >= 0) {
8199 if (*fmt == ')')
8200 --pcount;
8201 else if (*fmt == '(')
8202 ++pcount;
8203 fmt++;
8204 }
8205 keylen = fmt - keystart - 1;
8206 if (fmtcnt < 0 || pcount > 0) {
8207 PyErr_SetString(PyExc_ValueError,
8208 "incomplete format key");
8209 goto onError;
8210 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008211#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008212 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008213 then looked up since Python uses strings to hold
8214 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008215 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216 key = PyUnicode_EncodeUTF8(keystart,
8217 keylen,
8218 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008219#else
8220 key = PyUnicode_FromUnicode(keystart, keylen);
8221#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008222 if (key == NULL)
8223 goto onError;
8224 if (args_owned) {
8225 Py_DECREF(args);
8226 args_owned = 0;
8227 }
8228 args = PyObject_GetItem(dict, key);
8229 Py_DECREF(key);
8230 if (args == NULL) {
8231 goto onError;
8232 }
8233 args_owned = 1;
8234 arglen = -1;
8235 argidx = -2;
8236 }
8237 while (--fmtcnt >= 0) {
8238 switch (c = *fmt++) {
8239 case '-': flags |= F_LJUST; continue;
8240 case '+': flags |= F_SIGN; continue;
8241 case ' ': flags |= F_BLANK; continue;
8242 case '#': flags |= F_ALT; continue;
8243 case '0': flags |= F_ZERO; continue;
8244 }
8245 break;
8246 }
8247 if (c == '*') {
8248 v = getnextarg(args, arglen, &argidx);
8249 if (v == NULL)
8250 goto onError;
8251 if (!PyInt_Check(v)) {
8252 PyErr_SetString(PyExc_TypeError,
8253 "* wants int");
8254 goto onError;
8255 }
8256 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008257 if (width == -1 && PyErr_Occurred())
8258 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008259 if (width < 0) {
8260 flags |= F_LJUST;
8261 width = -width;
8262 }
8263 if (--fmtcnt >= 0)
8264 c = *fmt++;
8265 }
8266 else if (c >= '0' && c <= '9') {
8267 width = c - '0';
8268 while (--fmtcnt >= 0) {
8269 c = *fmt++;
8270 if (c < '0' || c > '9')
8271 break;
8272 if ((width*10) / 10 != width) {
8273 PyErr_SetString(PyExc_ValueError,
8274 "width too big");
8275 goto onError;
8276 }
8277 width = width*10 + (c - '0');
8278 }
8279 }
8280 if (c == '.') {
8281 prec = 0;
8282 if (--fmtcnt >= 0)
8283 c = *fmt++;
8284 if (c == '*') {
8285 v = getnextarg(args, arglen, &argidx);
8286 if (v == NULL)
8287 goto onError;
8288 if (!PyInt_Check(v)) {
8289 PyErr_SetString(PyExc_TypeError,
8290 "* wants int");
8291 goto onError;
8292 }
8293 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008294 if (prec == -1 && PyErr_Occurred())
8295 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296 if (prec < 0)
8297 prec = 0;
8298 if (--fmtcnt >= 0)
8299 c = *fmt++;
8300 }
8301 else if (c >= '0' && c <= '9') {
8302 prec = c - '0';
8303 while (--fmtcnt >= 0) {
8304 c = Py_CHARMASK(*fmt++);
8305 if (c < '0' || c > '9')
8306 break;
8307 if ((prec*10) / 10 != prec) {
8308 PyErr_SetString(PyExc_ValueError,
8309 "prec too big");
8310 goto onError;
8311 }
8312 prec = prec*10 + (c - '0');
8313 }
8314 }
8315 } /* prec */
8316 if (fmtcnt >= 0) {
8317 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008318 if (--fmtcnt >= 0)
8319 c = *fmt++;
8320 }
8321 }
8322 if (fmtcnt < 0) {
8323 PyErr_SetString(PyExc_ValueError,
8324 "incomplete format");
8325 goto onError;
8326 }
8327 if (c != '%') {
8328 v = getnextarg(args, arglen, &argidx);
8329 if (v == NULL)
8330 goto onError;
8331 }
8332 sign = 0;
8333 fill = ' ';
8334 switch (c) {
8335
8336 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008337 pbuf = formatbuf;
8338 /* presume that buffer length is at least 1 */
8339 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340 len = 1;
8341 break;
8342
8343 case 's':
8344 case 'r':
8345 if (PyUnicode_Check(v) && c == 's') {
8346 temp = v;
8347 Py_INCREF(temp);
8348 }
8349 else {
8350 PyObject *unicode;
8351 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008352 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008353 else
8354 temp = PyObject_Repr(v);
8355 if (temp == NULL)
8356 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008357 if (PyUnicode_Check(temp))
8358 /* nothing to do */;
8359 else if (PyString_Check(temp)) {
8360 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008361 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008362 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008363 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008364 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008365 Py_DECREF(temp);
8366 temp = unicode;
8367 if (temp == NULL)
8368 goto onError;
8369 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008370 else {
8371 Py_DECREF(temp);
8372 PyErr_SetString(PyExc_TypeError,
8373 "%s argument has non-string str()");
8374 goto onError;
8375 }
8376 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008377 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008378 len = PyUnicode_GET_SIZE(temp);
8379 if (prec >= 0 && len > prec)
8380 len = prec;
8381 break;
8382
8383 case 'i':
8384 case 'd':
8385 case 'u':
8386 case 'o':
8387 case 'x':
8388 case 'X':
8389 if (c == 'i')
8390 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008391 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008392 temp = formatlong(v, flags, prec, c);
8393 if (!temp)
8394 goto onError;
8395 pbuf = PyUnicode_AS_UNICODE(temp);
8396 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008397 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008398 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008399 else {
8400 pbuf = formatbuf;
8401 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8402 flags, prec, c, v);
8403 if (len < 0)
8404 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008405 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008406 }
8407 if (flags & F_ZERO)
8408 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008409 break;
8410
8411 case 'e':
8412 case 'E':
8413 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008414 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008415 case 'g':
8416 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008417 if (c == 'F')
8418 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008419 pbuf = formatbuf;
8420 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8421 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008422 if (len < 0)
8423 goto onError;
8424 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008425 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008426 fill = '0';
8427 break;
8428
8429 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008430 pbuf = formatbuf;
8431 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008432 if (len < 0)
8433 goto onError;
8434 break;
8435
8436 default:
8437 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008438 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008439 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008440 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008441 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008442 (Py_ssize_t)(fmt - 1 -
8443 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008444 goto onError;
8445 }
8446 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008447 if (*pbuf == '-' || *pbuf == '+') {
8448 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008449 len--;
8450 }
8451 else if (flags & F_SIGN)
8452 sign = '+';
8453 else if (flags & F_BLANK)
8454 sign = ' ';
8455 else
8456 sign = 0;
8457 }
8458 if (width < len)
8459 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008460 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008461 reslen -= rescnt;
8462 rescnt = width + fmtcnt + 100;
8463 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008464 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008465 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008466 PyErr_NoMemory();
8467 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008468 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008469 if (_PyUnicode_Resize(&result, reslen) < 0) {
8470 Py_XDECREF(temp);
8471 goto onError;
8472 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008473 res = PyUnicode_AS_UNICODE(result)
8474 + reslen - rescnt;
8475 }
8476 if (sign) {
8477 if (fill != ' ')
8478 *res++ = sign;
8479 rescnt--;
8480 if (width > len)
8481 width--;
8482 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008483 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008484 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008485 assert(pbuf[1] == c);
8486 if (fill != ' ') {
8487 *res++ = *pbuf++;
8488 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008489 }
Tim Petersfff53252001-04-12 18:38:48 +00008490 rescnt -= 2;
8491 width -= 2;
8492 if (width < 0)
8493 width = 0;
8494 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008495 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008496 if (width > len && !(flags & F_LJUST)) {
8497 do {
8498 --rescnt;
8499 *res++ = fill;
8500 } while (--width > len);
8501 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008502 if (fill == ' ') {
8503 if (sign)
8504 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008505 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008506 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008507 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008508 *res++ = *pbuf++;
8509 *res++ = *pbuf++;
8510 }
8511 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008512 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008513 res += len;
8514 rescnt -= len;
8515 while (--width >= len) {
8516 --rescnt;
8517 *res++ = ' ';
8518 }
8519 if (dict && (argidx < arglen) && c != '%') {
8520 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008521 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008522 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008523 goto onError;
8524 }
8525 Py_XDECREF(temp);
8526 } /* '%' */
8527 } /* until end */
8528 if (argidx < arglen && !dict) {
8529 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008530 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008531 goto onError;
8532 }
8533
Thomas Woutersa96affe2006-03-12 00:29:36 +00008534 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8535 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008536 if (args_owned) {
8537 Py_DECREF(args);
8538 }
8539 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008540 return (PyObject *)result;
8541
8542 onError:
8543 Py_XDECREF(result);
8544 Py_DECREF(uformat);
8545 if (args_owned) {
8546 Py_DECREF(args);
8547 }
8548 return NULL;
8549}
8550
8551static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008552 (readbufferproc) unicode_buffer_getreadbuf,
8553 (writebufferproc) unicode_buffer_getwritebuf,
8554 (segcountproc) unicode_buffer_getsegcount,
8555 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008556};
8557
Jeremy Hylton938ace62002-07-17 16:30:39 +00008558static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008559unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8560
Tim Peters6d6c1a32001-08-02 04:15:00 +00008561static PyObject *
8562unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8563{
8564 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00008565 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008566 char *encoding = NULL;
8567 char *errors = NULL;
8568
Guido van Rossume023fe02001-08-30 03:12:59 +00008569 if (type != &PyUnicode_Type)
8570 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008571 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8572 kwlist, &x, &encoding, &errors))
8573 return NULL;
8574 if (x == NULL)
8575 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008576 if (encoding == NULL && errors == NULL)
8577 return PyObject_Unicode(x);
8578 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008579 return PyUnicode_FromEncodedObject(x, encoding, errors);
8580}
8581
Guido van Rossume023fe02001-08-30 03:12:59 +00008582static PyObject *
8583unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8584{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008585 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008586 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008587
8588 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8589 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8590 if (tmp == NULL)
8591 return NULL;
8592 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008593 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008594 if (pnew == NULL) {
8595 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008596 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008597 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008598 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8599 if (pnew->str == NULL) {
8600 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008601 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008602 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008603 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008604 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008605 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8606 pnew->length = n;
8607 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008608 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008609 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008610}
8611
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008612PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008613"unicode(string [, encoding[, errors]]) -> object\n\
8614\n\
8615Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008616encoding defaults to the current default string encoding.\n\
8617errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008618
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008619static PyObject *unicode_iter(PyObject *seq);
8620
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008622 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008623 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624 sizeof(PyUnicodeObject), /* tp_size */
8625 0, /* tp_itemsize */
8626 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008627 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008628 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008629 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008630 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008631 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008632 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008633 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008635 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008636 (hashfunc) unicode_hash, /* tp_hash*/
8637 0, /* tp_call*/
8638 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008639 PyObject_GenericGetAttr, /* tp_getattro */
8640 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008641 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008642 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8643 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008644 unicode_doc, /* tp_doc */
8645 0, /* tp_traverse */
8646 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008647 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008648 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008649 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008650 0, /* tp_iternext */
8651 unicode_methods, /* tp_methods */
8652 0, /* tp_members */
8653 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008654 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008655 0, /* tp_dict */
8656 0, /* tp_descr_get */
8657 0, /* tp_descr_set */
8658 0, /* tp_dictoffset */
8659 0, /* tp_init */
8660 0, /* tp_alloc */
8661 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008662 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008663};
8664
8665/* Initialize the Unicode implementation */
8666
Thomas Wouters78890102000-07-22 19:25:51 +00008667void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008668{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008669 int i;
8670
Thomas Wouters477c8d52006-05-27 19:21:47 +00008671 /* XXX - move this array to unicodectype.c ? */
8672 Py_UNICODE linebreak[] = {
8673 0x000A, /* LINE FEED */
8674 0x000D, /* CARRIAGE RETURN */
8675 0x001C, /* FILE SEPARATOR */
8676 0x001D, /* GROUP SEPARATOR */
8677 0x001E, /* RECORD SEPARATOR */
8678 0x0085, /* NEXT LINE */
8679 0x2028, /* LINE SEPARATOR */
8680 0x2029, /* PARAGRAPH SEPARATOR */
8681 };
8682
Fred Drakee4315f52000-05-09 19:53:39 +00008683 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008684 unicode_freelist = NULL;
8685 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008687 if (!unicode_empty)
8688 return;
8689
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008690 for (i = 0; i < 256; i++)
8691 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008692 if (PyType_Ready(&PyUnicode_Type) < 0)
8693 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008694
8695 /* initialize the linebreak bloom filter */
8696 bloom_linebreak = make_bloom_mask(
8697 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8698 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008699
8700 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008701}
8702
8703/* Finalize the Unicode implementation */
8704
8705void
Thomas Wouters78890102000-07-22 19:25:51 +00008706_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008707{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008708 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008709 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008710
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008711 Py_XDECREF(unicode_empty);
8712 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008713
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008714 for (i = 0; i < 256; i++) {
8715 if (unicode_latin1[i]) {
8716 Py_DECREF(unicode_latin1[i]);
8717 unicode_latin1[i] = NULL;
8718 }
8719 }
8720
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008721 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008722 PyUnicodeObject *v = u;
8723 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008724 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008725 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008726 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008727 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008728 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008729 unicode_freelist = NULL;
8730 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008731}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008732
Walter Dörwald16807132007-05-25 13:52:07 +00008733void
8734PyUnicode_InternInPlace(PyObject **p)
8735{
8736 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
8737 PyObject *t;
8738 if (s == NULL || !PyUnicode_Check(s))
8739 Py_FatalError(
8740 "PyUnicode_InternInPlace: unicode strings only please!");
8741 /* If it's a subclass, we don't really know what putting
8742 it in the interned dict might do. */
8743 if (!PyUnicode_CheckExact(s))
8744 return;
8745 if (PyUnicode_CHECK_INTERNED(s))
8746 return;
8747 if (interned == NULL) {
8748 interned = PyDict_New();
8749 if (interned == NULL) {
8750 PyErr_Clear(); /* Don't leave an exception */
8751 return;
8752 }
8753 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00008754 /* It might be that the GetItem call fails even
8755 though the key is present in the dictionary,
8756 namely when this happens during a stack overflow. */
8757 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00008758 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00008759 Py_END_ALLOW_RECURSION
8760
Walter Dörwald16807132007-05-25 13:52:07 +00008761 if (t) {
8762 Py_INCREF(t);
8763 Py_DECREF(*p);
8764 *p = t;
8765 return;
8766 }
8767
Martin v. Löwis5b222132007-06-10 09:51:05 +00008768 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00008769 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
8770 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00008771 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00008772 return;
8773 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00008774 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00008775 /* The two references in interned are not counted by refcnt.
8776 The deallocator will take care of this */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00008777 Py_Refcnt(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00008778 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
8779}
8780
8781void
8782PyUnicode_InternImmortal(PyObject **p)
8783{
8784 PyUnicode_InternInPlace(p);
8785 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
8786 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
8787 Py_INCREF(*p);
8788 }
8789}
8790
8791PyObject *
8792PyUnicode_InternFromString(const char *cp)
8793{
8794 PyObject *s = PyUnicode_FromString(cp);
8795 if (s == NULL)
8796 return NULL;
8797 PyUnicode_InternInPlace(&s);
8798 return s;
8799}
8800
8801void _Py_ReleaseInternedUnicodeStrings(void)
8802{
8803 PyObject *keys;
8804 PyUnicodeObject *s;
8805 Py_ssize_t i, n;
8806 Py_ssize_t immortal_size = 0, mortal_size = 0;
8807
8808 if (interned == NULL || !PyDict_Check(interned))
8809 return;
8810 keys = PyDict_Keys(interned);
8811 if (keys == NULL || !PyList_Check(keys)) {
8812 PyErr_Clear();
8813 return;
8814 }
8815
8816 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
8817 detector, interned unicode strings are not forcibly deallocated;
8818 rather, we give them their stolen references back, and then clear
8819 and DECREF the interned dict. */
8820
8821 n = PyList_GET_SIZE(keys);
8822 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
8823 n);
8824 for (i = 0; i < n; i++) {
8825 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
8826 switch (s->state) {
8827 case SSTATE_NOT_INTERNED:
8828 /* XXX Shouldn't happen */
8829 break;
8830 case SSTATE_INTERNED_IMMORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00008831 Py_Refcnt(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00008832 immortal_size += s->length;
8833 break;
8834 case SSTATE_INTERNED_MORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00008835 Py_Refcnt(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00008836 mortal_size += s->length;
8837 break;
8838 default:
8839 Py_FatalError("Inconsistent interned string state.");
8840 }
8841 s->state = SSTATE_NOT_INTERNED;
8842 }
8843 fprintf(stderr, "total size of all interned strings: "
8844 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
8845 "mortal/immortal\n", mortal_size, immortal_size);
8846 Py_DECREF(keys);
8847 PyDict_Clear(interned);
8848 Py_DECREF(interned);
8849 interned = NULL;
8850}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008851
8852
8853/********************* Unicode Iterator **************************/
8854
8855typedef struct {
8856 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00008857 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008858 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
8859} unicodeiterobject;
8860
8861static void
8862unicodeiter_dealloc(unicodeiterobject *it)
8863{
8864 _PyObject_GC_UNTRACK(it);
8865 Py_XDECREF(it->it_seq);
8866 PyObject_GC_Del(it);
8867}
8868
8869static int
8870unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8871{
8872 Py_VISIT(it->it_seq);
8873 return 0;
8874}
8875
8876static PyObject *
8877unicodeiter_next(unicodeiterobject *it)
8878{
8879 PyUnicodeObject *seq;
8880 PyObject *item;
8881
8882 assert(it != NULL);
8883 seq = it->it_seq;
8884 if (seq == NULL)
8885 return NULL;
8886 assert(PyUnicode_Check(seq));
8887
8888 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008889 item = PyUnicode_FromUnicode(
8890 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008891 if (item != NULL)
8892 ++it->it_index;
8893 return item;
8894 }
8895
8896 Py_DECREF(seq);
8897 it->it_seq = NULL;
8898 return NULL;
8899}
8900
8901static PyObject *
8902unicodeiter_len(unicodeiterobject *it)
8903{
8904 Py_ssize_t len = 0;
8905 if (it->it_seq)
8906 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8907 return PyInt_FromSsize_t(len);
8908}
8909
8910PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8911
8912static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008913 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8914 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008915 {NULL, NULL} /* sentinel */
8916};
8917
8918PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008919 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008920 "unicodeiterator", /* tp_name */
8921 sizeof(unicodeiterobject), /* tp_basicsize */
8922 0, /* tp_itemsize */
8923 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00008924 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008925 0, /* tp_print */
8926 0, /* tp_getattr */
8927 0, /* tp_setattr */
8928 0, /* tp_compare */
8929 0, /* tp_repr */
8930 0, /* tp_as_number */
8931 0, /* tp_as_sequence */
8932 0, /* tp_as_mapping */
8933 0, /* tp_hash */
8934 0, /* tp_call */
8935 0, /* tp_str */
8936 PyObject_GenericGetAttr, /* tp_getattro */
8937 0, /* tp_setattro */
8938 0, /* tp_as_buffer */
8939 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8940 0, /* tp_doc */
8941 (traverseproc)unicodeiter_traverse, /* tp_traverse */
8942 0, /* tp_clear */
8943 0, /* tp_richcompare */
8944 0, /* tp_weaklistoffset */
8945 PyObject_SelfIter, /* tp_iter */
8946 (iternextfunc)unicodeiter_next, /* tp_iternext */
8947 unicodeiter_methods, /* tp_methods */
8948 0,
8949};
8950
8951static PyObject *
8952unicode_iter(PyObject *seq)
8953{
8954 unicodeiterobject *it;
8955
8956 if (!PyUnicode_Check(seq)) {
8957 PyErr_BadInternalCall();
8958 return NULL;
8959 }
8960 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8961 if (it == NULL)
8962 return NULL;
8963 it->it_index = 0;
8964 Py_INCREF(seq);
8965 it->it_seq = (PyUnicodeObject *)seq;
8966 _PyObject_GC_TRACK(it);
8967 return (PyObject *)it;
8968}
8969
Martin v. Löwis5b222132007-06-10 09:51:05 +00008970size_t
8971Py_UNICODE_strlen(const Py_UNICODE *u)
8972{
8973 int res = 0;
8974 while(*u++)
8975 res++;
8976 return res;
8977}
8978
8979Py_UNICODE*
8980Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
8981{
8982 Py_UNICODE *u = s1;
8983 while ((*u++ = *s2++));
8984 return s1;
8985}
8986
8987Py_UNICODE*
8988Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
8989{
8990 Py_UNICODE *u = s1;
8991 while ((*u++ = *s2++))
8992 if (n-- == 0)
8993 break;
8994 return s1;
8995}
8996
8997int
8998Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
8999{
9000 while (*s1 && *s2 && *s1 == *s2)
9001 s1++, s2++;
9002 if (*s1 && *s2)
9003 return (*s1 < *s2) ? -1 : +1;
9004 if (*s1)
9005 return 1;
9006 if (*s2)
9007 return -1;
9008 return 0;
9009}
9010
9011Py_UNICODE*
9012Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9013{
9014 const Py_UNICODE *p;
9015 for (p = s; *p; p++)
9016 if (*p == c)
9017 return (Py_UNICODE*)p;
9018 return NULL;
9019}
9020
9021
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009022#ifdef __cplusplus
9023}
9024#endif
9025
9026
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009027/*
9028Local variables:
9029c-basic-offset: 4
9030indent-tabs-mode: nil
9031End:
9032*/