blob: b34598645af74ce57318e2ca17cef5d4a50ac627 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Walter Dörwald16807132007-05-25 13:52:07 +000095/* This dictionary holds all interned unicode strings. Note that references
96 to strings in this dictionary are *not* counted in the string's ob_refcnt.
97 When the interned string reaches a refcnt of 0 the string deallocation
98 function will delete the reference from this dictionary.
99
   Another way to look at this is to say that the actual reference
   count of an interned string is: s->ob_refcnt + (s->state ? 2 : 0)
102*/
103static PyObject *interned;
104
Guido van Rossumd57fd912000-03-10 22:53:23 +0000105/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000106static PyUnicodeObject *unicode_freelist;
107static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000109/* The empty Unicode object is shared to improve performance. */
110static PyUnicodeObject *unicode_empty;
111
112/* Single character Unicode strings in the Latin-1 range are being
113 shared as well. */
114static PyUnicodeObject *unicode_latin1[256];
115
Fred Drakee4315f52000-05-09 19:53:39 +0000116/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000117 parameter; it is fixed to "utf-8". Always use the
118 PyUnicode_GetDefaultEncoding() API to access this global. */
119static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000120
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000121Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000122PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000123{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000124#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000125 return 0x10FFFF;
126#else
127 /* This is actually an illegal character, so it should
128 not be passed to unichr. */
129 return 0xFFFF;
130#endif
131}
132
Thomas Wouters477c8d52006-05-27 19:21:47 +0000133/* --- Bloom Filters ----------------------------------------------------- */
134
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least 5
   bits of each Unicode character as the bit index. */
138
/* the linebreak mask is set up by _PyUnicode_Init() */
140
/* Integer type holding a bloom bitmask (one bit per 5-bit character hash). */
#define BLOOM_MASK unsigned long

/* Bloom mask of the line break characters; consulted by BLOOM_LINEBREAK. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of ch is set in mask.
   The constant being shifted is unsigned long on purpose: with a plain
   int, a bit index of 31 would shift into the sign bit, which is undefined
   behaviour and (after sign extension) matched unrelated high mask bits. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* True if ch is a line break; the cheap bloom test runs first so that most
   characters never reach Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
149
150Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
151{
152 /* calculate simple bloom-style bitmask for a given unicode string */
153
154 long mask;
155 Py_ssize_t i;
156
157 mask = 0;
158 for (i = 0; i < len; i++)
159 mask |= (1 << (ptr[i] & 0x1F));
160
161 return mask;
162}
163
164Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
165{
166 Py_ssize_t i;
167
168 for (i = 0; i < setlen; i++)
169 if (set[i] == chr)
170 return 1;
171
172 return 0;
173}
174
/* True if chr is a member of set (setlen characters).  The bloom mask is
   checked first so that most non-members skip the linear search.
   The whole expansion is parenthesized so the macro can be negated or
   combined with other operators without changing its meaning. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
177
Guido van Rossumd57fd912000-03-10 22:53:23 +0000178/* --- Unicode Object ----------------------------------------------------- */
179
180static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000182 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183{
184 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000185
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000186 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000187 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000188 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190 /* Resizing shared object (unicode_empty or single character
191 objects) in-place is not allowed. Use PyUnicode_Resize()
192 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000194 if (unicode == unicode_empty ||
195 (unicode->length == 1 &&
196 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 return -1;
201 }
202
Thomas Wouters477c8d52006-05-27 19:21:47 +0000203 /* We allocate one more byte to make sure the string is Ux0000 terminated.
204 The overallocation is also used by fastsearch, which assumes that it's
205 safe to look at str[length] (without making any assumptions about what
206 it contains). */
207
Guido van Rossumd57fd912000-03-10 22:53:23 +0000208 oldstr = unicode->str;
209 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
210 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000211 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212 PyErr_NoMemory();
213 return -1;
214 }
215 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000216 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000218 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000220 if (unicode->defenc) {
221 Py_DECREF(unicode->defenc);
222 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000223 }
224 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000225
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 return 0;
227}
228
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could be further enhanced by ensuring that the
   free list never reduces its size below 1.

*/
237
238static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000239PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000240{
241 register PyUnicodeObject *unicode;
242
Thomas Wouters477c8d52006-05-27 19:21:47 +0000243 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244 if (length == 0 && unicode_empty != NULL) {
245 Py_INCREF(unicode_empty);
246 return unicode_empty;
247 }
248
249 /* Unicode freelist & memory allocation */
250 if (unicode_freelist) {
251 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000252 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000255 /* Keep-Alive optimization: we only upsize the buffer,
256 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000257 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000258 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000259 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000260 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000261 }
262 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000263 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000265 }
266 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 }
268 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000269 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000270 if (unicode == NULL)
271 return NULL;
272 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
273 }
274
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000275 if (!unicode->str) {
276 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000277 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000278 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000279 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000280 * the caller fails before initializing str -- unicode_resize()
281 * reads str[0], and the Keep-Alive optimization can keep memory
282 * allocated for str alive across a call to unicode_dealloc(unicode).
283 * We don't want unicode_resize to read uninitialized memory in
284 * that case.
285 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000286 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000288 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000290 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000291 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000293
294 onError:
295 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000296 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000297 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298}
299
300static
Guido van Rossum9475a232001-10-05 20:51:39 +0000301void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302{
Walter Dörwald16807132007-05-25 13:52:07 +0000303 switch (PyUnicode_CHECK_INTERNED(unicode)) {
304 case SSTATE_NOT_INTERNED:
305 break;
306
307 case SSTATE_INTERNED_MORTAL:
308 /* revive dead object temporarily for DelItem */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +0000309 Py_Refcnt(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000310 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
311 Py_FatalError(
312 "deletion of interned unicode string failed");
313 break;
314
315 case SSTATE_INTERNED_IMMORTAL:
316 Py_FatalError("Immortal interned unicode string died.");
317
318 default:
319 Py_FatalError("Inconsistent interned unicode string state.");
320 }
321
Guido van Rossum604ddf82001-12-06 20:03:56 +0000322 if (PyUnicode_CheckExact(unicode) &&
323 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000324 /* Keep-Alive optimization */
325 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000326 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000327 unicode->str = NULL;
328 unicode->length = 0;
329 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000330 if (unicode->defenc) {
331 Py_DECREF(unicode->defenc);
332 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000333 }
334 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000335 *(PyUnicodeObject **)unicode = unicode_freelist;
336 unicode_freelist = unicode;
337 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000338 }
339 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000340 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000341 Py_XDECREF(unicode->defenc);
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000342 Py_Type(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 }
344}
345
Martin v. Löwis18e16552006-02-15 17:27:45 +0000346int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000347{
348 register PyUnicodeObject *v;
349
350 /* Argument checks */
351 if (unicode == NULL) {
352 PyErr_BadInternalCall();
353 return -1;
354 }
355 v = (PyUnicodeObject *)*unicode;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000356 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000357 PyErr_BadInternalCall();
358 return -1;
359 }
360
361 /* Resizing unicode_empty and single character objects is not
362 possible since these are being shared. We simply return a fresh
363 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000364 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000365 (v == unicode_empty || v->length == 1)) {
366 PyUnicodeObject *w = _PyUnicode_New(length);
367 if (w == NULL)
368 return -1;
369 Py_UNICODE_COPY(w->str, v->str,
370 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000371 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000372 *unicode = (PyObject *)w;
373 return 0;
374 }
375
376 /* Note that we don't have to modify *unicode for unshared Unicode
377 objects, since we can modify them in-place. */
378 return unicode_resize(v, length);
379}
380
/* Internal API for use in unicodeobject.c only !
   Forwards to PyUnicode_Resize(), casting unicodevar to the PyObject **
   that function expects. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
384
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000386 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387{
388 PyUnicodeObject *unicode;
389
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000390 /* If the Unicode data is known at construction time, we can apply
391 some optimizations which share commonly used objects. */
392 if (u != NULL) {
393
394 /* Optimization for empty strings */
395 if (size == 0 && unicode_empty != NULL) {
396 Py_INCREF(unicode_empty);
397 return (PyObject *)unicode_empty;
398 }
399
400 /* Single character Unicode objects in the Latin-1 range are
401 shared when using this constructor */
402 if (size == 1 && *u < 256) {
403 unicode = unicode_latin1[*u];
404 if (!unicode) {
405 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000406 if (!unicode)
407 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000408 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000409 unicode_latin1[*u] = unicode;
410 }
411 Py_INCREF(unicode);
412 return (PyObject *)unicode;
413 }
414 }
Tim Petersced69f82003-09-16 20:30:58 +0000415
Guido van Rossumd57fd912000-03-10 22:53:23 +0000416 unicode = _PyUnicode_New(size);
417 if (!unicode)
418 return NULL;
419
420 /* Copy the Unicode data into the new object */
421 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000422 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423
424 return (PyObject *)unicode;
425}
426
Walter Dörwaldd2034312007-05-18 16:29:38 +0000427PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000428{
429 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000430 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000431 some optimizations which share commonly used objects.
432 Also, this means the input must be UTF-8, so fall back to the
433 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000434 if (u != NULL) {
435
436 /* Optimization for empty strings */
437 if (size == 0 && unicode_empty != NULL) {
438 Py_INCREF(unicode_empty);
439 return (PyObject *)unicode_empty;
440 }
441
Martin v. Löwis9c121062007-08-05 20:26:11 +0000442 /* Single characters are shared when using this constructor.
443 Restrict to ASCII, since the input must be UTF-8. */
444 if (size == 1 && Py_CHARMASK(*u) < 128) {
Guido van Rossum00058aa2007-07-19 18:21:28 +0000445 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000446 if (!unicode) {
447 unicode = _PyUnicode_New(1);
448 if (!unicode)
449 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000450 unicode->str[0] = Py_CHARMASK(*u);
451 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000452 }
453 Py_INCREF(unicode);
454 return (PyObject *)unicode;
455 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000456
457 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000458 }
459
Walter Dörwald55507312007-05-18 13:12:10 +0000460 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000461 if (!unicode)
462 return NULL;
463
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000464 return (PyObject *)unicode;
465}
466
Walter Dörwaldd2034312007-05-18 16:29:38 +0000467PyObject *PyUnicode_FromString(const char *u)
468{
469 size_t size = strlen(u);
470 if (size > PY_SSIZE_T_MAX) {
471 PyErr_SetString(PyExc_OverflowError, "input too long");
472 return NULL;
473 }
474
475 return PyUnicode_FromStringAndSize(u, size);
476}
477
Guido van Rossumd57fd912000-03-10 22:53:23 +0000478#ifdef HAVE_WCHAR_H
479
480PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000481 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000482{
483 PyUnicodeObject *unicode;
484
485 if (w == NULL) {
486 PyErr_BadInternalCall();
487 return NULL;
488 }
489
490 unicode = _PyUnicode_New(size);
491 if (!unicode)
492 return NULL;
493
494 /* Copy the wchar_t data into the new object */
495#ifdef HAVE_USABLE_WCHAR_T
496 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000497#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498 {
499 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000500 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000501 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000502 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503 *u++ = *w++;
504 }
505#endif
506
507 return (PyObject *)unicode;
508}
509
Walter Dörwald346737f2007-05-31 10:44:43 +0000510static void
511makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
512{
513 *fmt++ = '%';
514 if (width) {
515 if (zeropad)
516 *fmt++ = '0';
517 fmt += sprintf(fmt, "%d", width);
518 }
519 if (precision)
520 fmt += sprintf(fmt, ".%d", precision);
521 if (longflag)
522 *fmt++ = 'l';
523 else if (size_tflag) {
524 char *f = PY_FORMAT_SIZE_T;
525 while (*f)
526 *fmt++ = *f++;
527 }
528 *fmt++ = c;
529 *fmt = '\0';
530}
531
/* Append the NUL-terminated char string `string` to the output position.
   Relies on two variables of the expanding function: `copy` (a scratch
   const char *) and `s` (the output cursor, advanced past the copied
   characters). */
#define appendstring(string) {for (copy = (string); *copy;) *s++ = *copy++;}
533
534PyObject *
535PyUnicode_FromFormatV(const char *format, va_list vargs)
536{
537 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000538 Py_ssize_t callcount = 0;
539 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000540 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000541 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000542 int width = 0;
543 int precision = 0;
544 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000545 const char* f;
546 Py_UNICODE *s;
547 PyObject *string;
548 /* used by sprintf */
549 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000550 /* use abuffer instead of buffer, if we need more space
551 * (which can happen if there's a format specifier with width). */
552 char *abuffer = NULL;
553 char *realbuffer;
554 Py_ssize_t abuffersize = 0;
555 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000556 const char *copy;
557
558#ifdef VA_LIST_IS_ARRAY
559 Py_MEMCPY(count, vargs, sizeof(va_list));
560#else
561#ifdef __va_copy
562 __va_copy(count, vargs);
563#else
564 count = vargs;
565#endif
566#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000567 /* step 1: count the number of %S/%R format specifications
568 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
569 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000570 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000571 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000572 ++callcount;
573 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000574 /* step 2: allocate memory for the results of
575 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000576 if (callcount) {
577 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
578 if (!callresults) {
579 PyErr_NoMemory();
580 return NULL;
581 }
582 callresult = callresults;
583 }
584 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000585 for (f = format; *f; f++) {
586 if (*f == '%') {
587 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000588 width = 0;
589 while (isdigit(Py_CHARMASK(*f)))
590 width = (width*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000591 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
592 ;
593
594 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
595 * they don't affect the amount of space we reserve.
596 */
597 if ((*f == 'l' || *f == 'z') &&
598 (f[1] == 'd' || f[1] == 'u'))
599 ++f;
600
601 switch (*f) {
602 case 'c':
603 (void)va_arg(count, int);
604 /* fall through... */
605 case '%':
606 n++;
607 break;
608 case 'd': case 'u': case 'i': case 'x':
609 (void) va_arg(count, int);
610 /* 20 bytes is enough to hold a 64-bit
611 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000612 This isn't enough for octal.
613 If a width is specified we need more
614 (which we allocate later). */
615 if (width < 20)
616 width = 20;
617 n += width;
618 if (abuffersize < width)
619 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000620 break;
621 case 's':
622 n += strlen(va_arg(count, char*));
623 break;
624 case 'U':
625 {
626 PyObject *obj = va_arg(count, PyObject *);
627 assert(obj && PyUnicode_Check(obj));
628 n += PyUnicode_GET_SIZE(obj);
629 break;
630 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000631 case 'V':
632 {
633 PyObject *obj = va_arg(count, PyObject *);
634 const char *str = va_arg(count, const char *);
635 assert(obj || str);
636 assert(!obj || PyUnicode_Check(obj));
637 if (obj)
638 n += PyUnicode_GET_SIZE(obj);
639 else
640 n += strlen(str);
641 break;
642 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000643 case 'S':
644 {
645 PyObject *obj = va_arg(count, PyObject *);
646 PyObject *str;
647 assert(obj);
648 str = PyObject_Unicode(obj);
649 if (!str)
650 goto fail;
651 n += PyUnicode_GET_SIZE(str);
652 /* Remember the str and switch to the next slot */
653 *callresult++ = str;
654 break;
655 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000656 case 'R':
657 {
658 PyObject *obj = va_arg(count, PyObject *);
659 PyObject *repr;
660 assert(obj);
661 repr = PyObject_Repr(obj);
662 if (!repr)
663 goto fail;
664 n += PyUnicode_GET_SIZE(repr);
665 /* Remember the repr and switch to the next slot */
666 *callresult++ = repr;
667 break;
668 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000669 case 'p':
670 (void) va_arg(count, int);
671 /* maximum 64-bit pointer representation:
672 * 0xffffffffffffffff
673 * so 19 characters is enough.
674 * XXX I count 18 -- what's the extra for?
675 */
676 n += 19;
677 break;
678 default:
679 /* if we stumble upon an unknown
680 formatting code, copy the rest of
681 the format string to the output
682 string. (we cannot just skip the
683 code, since there's no way to know
684 what's in the argument list) */
685 n += strlen(p);
686 goto expand;
687 }
688 } else
689 n++;
690 }
691 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000692 if (abuffersize > 20) {
693 abuffer = PyMem_Malloc(abuffersize);
694 if (!abuffer) {
695 PyErr_NoMemory();
696 goto fail;
697 }
698 realbuffer = abuffer;
699 }
700 else
701 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000702 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000703 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000704 we don't have to resize the string.
705 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000706 string = PyUnicode_FromUnicode(NULL, n);
707 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000708 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000709
710 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000711 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000712
713 for (f = format; *f; f++) {
714 if (*f == '%') {
715 const char* p = f++;
716 int longflag = 0;
717 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000718 zeropad = (*f == '0');
719 /* parse the width.precision part */
720 width = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000721 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000722 width = (width*10) + *f++ - '0';
723 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000724 if (*f == '.') {
725 f++;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000726 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000727 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000728 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000729 /* handle the long flag, but only for %ld and %lu.
730 others can be added when necessary. */
731 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
732 longflag = 1;
733 ++f;
734 }
735 /* handle the size_t flag. */
736 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
737 size_tflag = 1;
738 ++f;
739 }
740
741 switch (*f) {
742 case 'c':
743 *s++ = va_arg(vargs, int);
744 break;
745 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000746 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000747 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000748 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000749 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000750 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000751 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000752 sprintf(realbuffer, fmt, va_arg(vargs, int));
753 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000754 break;
755 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000756 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000757 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000758 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000759 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000760 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000761 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000762 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
763 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000764 break;
765 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000766 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
767 sprintf(realbuffer, fmt, va_arg(vargs, int));
768 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000769 break;
770 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000771 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
772 sprintf(realbuffer, fmt, va_arg(vargs, int));
773 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000774 break;
775 case 's':
776 p = va_arg(vargs, char*);
777 appendstring(p);
778 break;
779 case 'U':
780 {
781 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000782 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
783 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
784 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000785 break;
786 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000787 case 'V':
788 {
789 PyObject *obj = va_arg(vargs, PyObject *);
790 const char *str = va_arg(vargs, const char *);
791 if (obj) {
792 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
793 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
794 s += size;
795 } else {
796 appendstring(str);
797 }
798 break;
799 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000800 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000801 case 'R':
802 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000803 Py_UNICODE *ucopy;
804 Py_ssize_t usize;
805 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000806 /* unused, since we already have the result */
807 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000808 ucopy = PyUnicode_AS_UNICODE(*callresult);
809 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000810 for (upos = 0; upos<usize;)
811 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000812 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000813 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000814 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000815 ++callresult;
816 break;
817 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000818 case 'p':
819 sprintf(buffer, "%p", va_arg(vargs, void*));
820 /* %p is ill-defined: ensure leading 0x. */
821 if (buffer[1] == 'X')
822 buffer[1] = 'x';
823 else if (buffer[1] != 'x') {
824 memmove(buffer+2, buffer, strlen(buffer)+1);
825 buffer[0] = '0';
826 buffer[1] = 'x';
827 }
828 appendstring(buffer);
829 break;
830 case '%':
831 *s++ = '%';
832 break;
833 default:
834 appendstring(p);
835 goto end;
836 }
837 } else
838 *s++ = *f;
839 }
840
841 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000842 if (callresults)
843 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000844 if (abuffer)
845 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000846 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
847 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000848 fail:
849 if (callresults) {
850 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000851 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000852 Py_DECREF(*callresult2);
853 ++callresult2;
854 }
855 PyMem_Free(callresults);
856 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000857 if (abuffer)
858 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000859 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000860}
861
862#undef appendstring
863
864PyObject *
865PyUnicode_FromFormat(const char *format, ...)
866{
867 PyObject* ret;
868 va_list vargs;
869
870#ifdef HAVE_STDARG_PROTOTYPES
871 va_start(vargs, format);
872#else
873 va_start(vargs);
874#endif
875 ret = PyUnicode_FromFormatV(format, vargs);
876 va_end(vargs);
877 return ret;
878}
879
Martin v. Löwis18e16552006-02-15 17:27:45 +0000880Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
881 wchar_t *w,
882 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000883{
884 if (unicode == NULL) {
885 PyErr_BadInternalCall();
886 return -1;
887 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000888
889 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000890 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000891 size = PyUnicode_GET_SIZE(unicode) + 1;
892
Guido van Rossumd57fd912000-03-10 22:53:23 +0000893#ifdef HAVE_USABLE_WCHAR_T
894 memcpy(w, unicode->str, size * sizeof(wchar_t));
895#else
896 {
897 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000898 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000899 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000900 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000901 *w++ = *u++;
902 }
903#endif
904
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000905 if (size > PyUnicode_GET_SIZE(unicode))
906 return PyUnicode_GET_SIZE(unicode);
907 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000908 return size;
909}
910
911#endif
912
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000913PyObject *PyUnicode_FromOrdinal(int ordinal)
914{
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000915 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000916
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000917 if (ordinal < 0 || ordinal > 0x10ffff) {
918 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000919 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000920 return NULL;
921 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000922
923#ifndef Py_UNICODE_WIDE
924 if (ordinal > 0xffff) {
925 ordinal -= 0x10000;
926 s[0] = 0xD800 | (ordinal >> 10);
927 s[1] = 0xDC00 | (ordinal & 0x3FF);
928 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000929 }
930#endif
931
Hye-Shik Chang40574832004-04-06 07:24:51 +0000932 s[0] = (Py_UNICODE)ordinal;
933 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000934}
935
Guido van Rossumd57fd912000-03-10 22:53:23 +0000936PyObject *PyUnicode_FromObject(register PyObject *obj)
937{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000938 /* XXX Perhaps we should make this API an alias of
939 PyObject_Unicode() instead ?! */
940 if (PyUnicode_CheckExact(obj)) {
941 Py_INCREF(obj);
942 return obj;
943 }
944 if (PyUnicode_Check(obj)) {
945 /* For a Unicode subtype that's not a Unicode object,
946 return a true Unicode object with the same data. */
947 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
948 PyUnicode_GET_SIZE(obj));
949 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000950 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
951}
952
953PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
954 const char *encoding,
955 const char *errors)
956{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000957 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000958 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000959 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000960
Guido van Rossumd57fd912000-03-10 22:53:23 +0000961 if (obj == NULL) {
962 PyErr_BadInternalCall();
963 return NULL;
964 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000965
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000966#if 0
967 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000968 that no encodings is given and then redirect to
969 PyObject_Unicode() which then applies the additional logic for
970 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000971
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000972 NOTE: This API should really only be used for object which
973 represent *encoded* Unicode !
974
975 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000976 if (PyUnicode_Check(obj)) {
977 if (encoding) {
978 PyErr_SetString(PyExc_TypeError,
979 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000980 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000981 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000982 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000983 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000984#else
985 if (PyUnicode_Check(obj)) {
986 PyErr_SetString(PyExc_TypeError,
987 "decoding Unicode is not supported");
988 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000989 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000990#endif
991
992 /* Coerce object */
993 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000994 s = PyString_AS_STRING(obj);
995 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000996 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000997 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
998 /* Overwrite the error message with something more useful in
999 case of a TypeError. */
1000 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001001 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001002 "coercing to Unicode: need string or buffer, "
1003 "%.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001004 Py_Type(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001005 goto onError;
1006 }
Tim Petersced69f82003-09-16 20:30:58 +00001007
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001008 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001009 if (len == 0) {
1010 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001011 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001012 }
Tim Petersced69f82003-09-16 20:30:58 +00001013 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001014 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001015
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001016 return v;
1017
1018 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001019 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001020}
1021
1022PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001023 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001024 const char *encoding,
1025 const char *errors)
1026{
1027 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001028
1029 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001030 encoding = PyUnicode_GetDefaultEncoding();
1031
1032 /* Shortcuts for common default encodings */
1033 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001034 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001035 else if (strcmp(encoding, "latin-1") == 0)
1036 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001037#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1038 else if (strcmp(encoding, "mbcs") == 0)
1039 return PyUnicode_DecodeMBCS(s, size, errors);
1040#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001041 else if (strcmp(encoding, "ascii") == 0)
1042 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001043
1044 /* Decode via the codec registry */
1045 buffer = PyBuffer_FromMemory((void *)s, size);
1046 if (buffer == NULL)
1047 goto onError;
1048 unicode = PyCodec_Decode(buffer, encoding, errors);
1049 if (unicode == NULL)
1050 goto onError;
1051 if (!PyUnicode_Check(unicode)) {
1052 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001053 "decoder did not return an unicode object (type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001054 Py_Type(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001055 Py_DECREF(unicode);
1056 goto onError;
1057 }
1058 Py_DECREF(buffer);
1059 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001060
Guido van Rossumd57fd912000-03-10 22:53:23 +00001061 onError:
1062 Py_XDECREF(buffer);
1063 return NULL;
1064}
1065
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001066PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1067 const char *encoding,
1068 const char *errors)
1069{
1070 PyObject *v;
1071
1072 if (!PyUnicode_Check(unicode)) {
1073 PyErr_BadArgument();
1074 goto onError;
1075 }
1076
1077 if (encoding == NULL)
1078 encoding = PyUnicode_GetDefaultEncoding();
1079
1080 /* Decode via the codec registry */
1081 v = PyCodec_Decode(unicode, encoding, errors);
1082 if (v == NULL)
1083 goto onError;
1084 return v;
1085
1086 onError:
1087 return NULL;
1088}
1089
Guido van Rossumd57fd912000-03-10 22:53:23 +00001090PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001091 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001092 const char *encoding,
1093 const char *errors)
1094{
1095 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001096
Guido van Rossumd57fd912000-03-10 22:53:23 +00001097 unicode = PyUnicode_FromUnicode(s, size);
1098 if (unicode == NULL)
1099 return NULL;
1100 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1101 Py_DECREF(unicode);
1102 return v;
1103}
1104
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001105PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1106 const char *encoding,
1107 const char *errors)
1108{
1109 PyObject *v;
1110
1111 if (!PyUnicode_Check(unicode)) {
1112 PyErr_BadArgument();
1113 goto onError;
1114 }
1115
1116 if (encoding == NULL)
1117 encoding = PyUnicode_GetDefaultEncoding();
1118
1119 /* Encode via the codec registry */
1120 v = PyCodec_Encode(unicode, encoding, errors);
1121 if (v == NULL)
1122 goto onError;
1123 return v;
1124
1125 onError:
1126 return NULL;
1127}
1128
Guido van Rossumd57fd912000-03-10 22:53:23 +00001129PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1130 const char *encoding,
1131 const char *errors)
1132{
1133 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001134
Guido van Rossumd57fd912000-03-10 22:53:23 +00001135 if (!PyUnicode_Check(unicode)) {
1136 PyErr_BadArgument();
1137 goto onError;
1138 }
Fred Drakee4315f52000-05-09 19:53:39 +00001139
Tim Petersced69f82003-09-16 20:30:58 +00001140 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001141 encoding = PyUnicode_GetDefaultEncoding();
1142
1143 /* Shortcuts for common default encodings */
1144 if (errors == NULL) {
1145 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001146 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001147 else if (strcmp(encoding, "latin-1") == 0)
1148 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001149#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1150 else if (strcmp(encoding, "mbcs") == 0)
1151 return PyUnicode_AsMBCSString(unicode);
1152#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001153 else if (strcmp(encoding, "ascii") == 0)
1154 return PyUnicode_AsASCIIString(unicode);
1155 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001156
1157 /* Encode via the codec registry */
1158 v = PyCodec_Encode(unicode, encoding, errors);
1159 if (v == NULL)
1160 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001161 if (!PyBytes_Check(v)) {
1162 if (PyString_Check(v)) {
1163 /* Old codec, turn it into bytes */
1164 PyObject *b = PyBytes_FromObject(v);
1165 Py_DECREF(v);
1166 return b;
1167 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001169 "encoder did not return a bytes object "
1170 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1171 v->ob_type->tp_name,
1172 encoding ? encoding : "NULL",
1173 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174 Py_DECREF(v);
1175 goto onError;
1176 }
1177 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001178
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179 onError:
1180 return NULL;
1181}
1182
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001183PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1184 const char *errors)
1185{
1186 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001187 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001188 if (v)
1189 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001190 if (errors != NULL)
1191 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum06610092007-08-16 21:02:22 +00001192 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1193 PyUnicode_GET_SIZE(unicode),
1194 NULL);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001195 if (!b)
1196 return NULL;
1197 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1198 PyBytes_Size(b));
1199 Py_DECREF(b);
Guido van Rossume7a0d392007-07-12 07:53:00 +00001200 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001201 return v;
1202}
1203
Martin v. Löwis5b222132007-06-10 09:51:05 +00001204char*
1205PyUnicode_AsString(PyObject *unicode)
1206{
1207 assert(PyUnicode_Check(unicode));
1208 unicode = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1209 if (!unicode)
1210 return NULL;
1211 return PyString_AsString(unicode);
1212}
1213
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1215{
1216 if (!PyUnicode_Check(unicode)) {
1217 PyErr_BadArgument();
1218 goto onError;
1219 }
1220 return PyUnicode_AS_UNICODE(unicode);
1221
1222 onError:
1223 return NULL;
1224}
1225
Martin v. Löwis18e16552006-02-15 17:27:45 +00001226Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001227{
1228 if (!PyUnicode_Check(unicode)) {
1229 PyErr_BadArgument();
1230 goto onError;
1231 }
1232 return PyUnicode_GET_SIZE(unicode);
1233
1234 onError:
1235 return -1;
1236}
1237
Thomas Wouters78890102000-07-22 19:25:51 +00001238const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001239{
1240 return unicode_default_encoding;
1241}
1242
1243int PyUnicode_SetDefaultEncoding(const char *encoding)
1244{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001245 if (strcmp(encoding, unicode_default_encoding) != 0) {
1246 PyErr_Format(PyExc_ValueError,
1247 "Can only set default encoding to %s",
1248 unicode_default_encoding);
1249 return -1;
1250 }
Fred Drakee4315f52000-05-09 19:53:39 +00001251 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001252}
1253
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001254/* error handling callback helper:
1255 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001256 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001257 and adjust various state variables.
1258 return 0 on success, -1 on error
1259*/
1260
1261static
1262int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1263 const char *encoding, const char *reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001264 const char **input, const char **inend, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001265 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001266{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001267 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001268
1269 PyObject *restuple = NULL;
1270 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001271 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001272 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001273 Py_ssize_t requiredsize;
1274 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001275 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001276 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001277 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001278 int res = -1;
1279
1280 if (*errorHandler == NULL) {
1281 *errorHandler = PyCodec_LookupError(errors);
1282 if (*errorHandler == NULL)
1283 goto onError;
1284 }
1285
1286 if (*exceptionObject == NULL) {
1287 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001288 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001289 if (*exceptionObject == NULL)
1290 goto onError;
1291 }
1292 else {
1293 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1294 goto onError;
1295 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1296 goto onError;
1297 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1298 goto onError;
1299 }
1300
1301 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1302 if (restuple == NULL)
1303 goto onError;
1304 if (!PyTuple_Check(restuple)) {
1305 PyErr_Format(PyExc_TypeError, &argparse[4]);
1306 goto onError;
1307 }
1308 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1309 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001310
1311 /* Copy back the bytes variables, which might have been modified by the
1312 callback */
1313 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1314 if (!inputobj)
1315 goto onError;
1316 if (!PyBytes_Check(inputobj)) {
1317 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1318 }
1319 *input = PyBytes_AS_STRING(inputobj);
1320 insize = PyBytes_GET_SIZE(inputobj);
1321 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001322 /* we can DECREF safely, as the exception has another reference,
1323 so the object won't go away. */
1324 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001325
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001326 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001327 newpos = insize+newpos;
1328 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001329 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001330 goto onError;
1331 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001332
1333 /* need more space? (at least enough for what we
1334 have+the replacement+the rest of the string (starting
1335 at the new input position), so we won't have to check space
1336 when there are no errors in the rest of the string) */
1337 repptr = PyUnicode_AS_UNICODE(repunicode);
1338 repsize = PyUnicode_GET_SIZE(repunicode);
1339 requiredsize = *outpos + repsize + insize-newpos;
1340 if (requiredsize > outsize) {
1341 if (requiredsize<2*outsize)
1342 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001343 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001344 goto onError;
1345 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1346 }
1347 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001348 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001349 Py_UNICODE_COPY(*outptr, repptr, repsize);
1350 *outptr += repsize;
1351 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001352
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001353 /* we made it! */
1354 res = 0;
1355
1356 onError:
1357 Py_XDECREF(restuple);
1358 return res;
1359}
1360
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001361/* --- UTF-7 Codec -------------------------------------------------------- */
1362
1363/* see RFC2152 for details */
1364
/* Classification of the 128 ASCII characters for UTF-7, indexed by
   character code (16 entries per row, so row k covers codes
   16*k .. 16*k+15). */
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
1383
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* SPECIAL(c, encodeO, encodeWS): true when `c` may not be written
   directly: it is outside 1..127, or its utf7_special class is 1, or
   class 2 with encodeWS set, or class 3 with encodeO set.
   `c` is evaluated several times. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 alphabet character for the low 6 bits of `n`. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* B64CHAR(c): true when `c` is alphanumeric (per isalnum), '+' or '/'. */
#define B64CHAR(c) \
    (isalnum(c) || (c) == '+' || (c) == '/')
/* UB64(c): inverse of B64 for ASCII input: '+' -> 62, '/' -> 63,
   'a'..'z' -> 26..51, 'A'..'Z' -> 0..25, and c + 4 otherwise
   ('0'..'9' -> 52..61).  No validation is done here. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least 6 bits are pending in `ch`,
   write the base64 character for the topmost 6 pending bits to *out++
   and reduce `bits` by 6.  Expands to a bare while statement and
   modifies `out` and `bits` in the caller. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending
   in `ch`, extract the topmost 16 pending bits as one code unit.
   - if `surrogate` is set, the unit is dropped and the flag cleared;
   - if the unit lies in 0xDC00..0xDFFF, `surrogate` is set, `errmsg`
     is assigned and control jumps to the `utf7Error` label;
   - otherwise the unit is stored through *out++.
   Expands to a bare while statement; the expanding function must
   provide a variable `errmsg` and a label `utf7Error`. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001426
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001427PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001428 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001429 const char *errors)
1430{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001431 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001432 Py_ssize_t startinpos;
1433 Py_ssize_t endinpos;
1434 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001435 const char *e;
1436 PyUnicodeObject *unicode;
1437 Py_UNICODE *p;
1438 const char *errmsg = "";
1439 int inShift = 0;
1440 unsigned int bitsleft = 0;
1441 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001442 int surrogate = 0;
1443 PyObject *errorHandler = NULL;
1444 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001445
1446 unicode = _PyUnicode_New(size);
1447 if (!unicode)
1448 return NULL;
1449 if (size == 0)
1450 return (PyObject *)unicode;
1451
1452 p = unicode->str;
1453 e = s + size;
1454
1455 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001456 Py_UNICODE ch;
1457 restart:
1458 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001459
1460 if (inShift) {
1461 if ((ch == '-') || !B64CHAR(ch)) {
1462 inShift = 0;
1463 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001464
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001465 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1466 if (bitsleft >= 6) {
1467 /* The shift sequence has a partial character in it. If
1468 bitsleft < 6 then we could just classify it as padding
1469 but that is not the case here */
1470
1471 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001472 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001473 }
1474 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001475 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001476 here so indicate the potential of a misencoded character. */
1477
1478 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1479 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1480 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001481 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001482 }
1483
1484 if (ch == '-') {
1485 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001486 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001487 inShift = 1;
1488 }
1489 } else if (SPECIAL(ch,0,0)) {
1490 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001491 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001492 } else {
1493 *p++ = ch;
1494 }
1495 } else {
1496 charsleft = (charsleft << 6) | UB64(ch);
1497 bitsleft += 6;
1498 s++;
1499 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1500 }
1501 }
1502 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001503 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001504 s++;
1505 if (s < e && *s == '-') {
1506 s++;
1507 *p++ = '+';
1508 } else
1509 {
1510 inShift = 1;
1511 bitsleft = 0;
1512 }
1513 }
1514 else if (SPECIAL(ch,0,0)) {
1515 errmsg = "unexpected special character";
1516 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001517 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001518 }
1519 else {
1520 *p++ = ch;
1521 s++;
1522 }
1523 continue;
1524 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001525 outpos = p-PyUnicode_AS_UNICODE(unicode);
1526 endinpos = s-starts;
1527 if (unicode_decode_call_errorhandler(
1528 errors, &errorHandler,
1529 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001530 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001531 (PyObject **)&unicode, &outpos, &p))
1532 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001533 }
1534
1535 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001536 outpos = p-PyUnicode_AS_UNICODE(unicode);
1537 endinpos = size;
1538 if (unicode_decode_call_errorhandler(
1539 errors, &errorHandler,
1540 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001541 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001542 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001543 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001544 if (s < e)
1545 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001546 }
1547
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001548 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001549 goto onError;
1550
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001551 Py_XDECREF(errorHandler);
1552 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001553 return (PyObject *)unicode;
1554
1555onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001556 Py_XDECREF(errorHandler);
1557 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001558 Py_DECREF(unicode);
1559 return NULL;
1560}
1561
1562
1563PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001564 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001565 int encodeSetO,
1566 int encodeWhiteSpace,
1567 const char *errors)
1568{
1569 PyObject *v;
1570 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001571 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001572 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001573 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001574 unsigned int bitsleft = 0;
1575 unsigned long charsleft = 0;
1576 char * out;
1577 char * start;
1578
1579 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001580 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001581
Walter Dörwald51ab4142007-05-05 14:43:36 +00001582 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001583 if (v == NULL)
1584 return NULL;
1585
Walter Dörwald51ab4142007-05-05 14:43:36 +00001586 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001587 for (;i < size; ++i) {
1588 Py_UNICODE ch = s[i];
1589
1590 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001591 if (ch == '+') {
1592 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001593 *out++ = '-';
1594 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1595 charsleft = ch;
1596 bitsleft = 16;
1597 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001598 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001599 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001600 } else {
1601 *out++ = (char) ch;
1602 }
1603 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001604 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1605 *out++ = B64(charsleft << (6-bitsleft));
1606 charsleft = 0;
1607 bitsleft = 0;
1608 /* Characters not in the BASE64 set implicitly unshift the sequence
1609 so no '-' is required, except if the character is itself a '-' */
1610 if (B64CHAR(ch) || ch == '-') {
1611 *out++ = '-';
1612 }
1613 inShift = 0;
1614 *out++ = (char) ch;
1615 } else {
1616 bitsleft += 16;
1617 charsleft = (charsleft << 16) | ch;
1618 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1619
1620 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001621 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001622 or '-' then the shift sequence will be terminated implicitly and we
1623 don't have to insert a '-'. */
1624
1625 if (bitsleft == 0) {
1626 if (i + 1 < size) {
1627 Py_UNICODE ch2 = s[i+1];
1628
1629 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001630
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001631 } else if (B64CHAR(ch2) || ch2 == '-') {
1632 *out++ = '-';
1633 inShift = 0;
1634 } else {
1635 inShift = 0;
1636 }
1637
1638 }
1639 else {
1640 *out++ = '-';
1641 inShift = 0;
1642 }
1643 }
Tim Petersced69f82003-09-16 20:30:58 +00001644 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001645 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001646 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001647 if (bitsleft) {
1648 *out++= B64(charsleft << (6-bitsleft) );
1649 *out++ = '-';
1650 }
1651
Walter Dörwald51ab4142007-05-05 14:43:36 +00001652 if (PyBytes_Resize(v, out - start)) {
1653 Py_DECREF(v);
1654 return NULL;
1655 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001656 return v;
1657}
1658
/* The UTF-7 helper macros are not needed past this point. */
#undef B64
#undef B64CHAR
#undef UB64
#undef SPECIAL
#undef DECODE
#undef ENCODE
1665
/* --- UTF-8 Codec -------------------------------------------------------- */
1667
/* Map a UTF-8 lead byte to the length in bytes of the sequence it starts.
   Zero marks a byte that is not a legal prefix.  See RFC 2279 for
   details. */
static
char utf8_code_length[256] = {
    /* 0x00-0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFF: four-, five- and six-byte prefixes; 0xFE/0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1689
Guido van Rossumd57fd912000-03-10 22:53:23 +00001690PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001691 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001692 const char *errors)
1693{
Walter Dörwald69652032004-09-07 20:24:22 +00001694 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1695}
1696
1697PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001698 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001699 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001700 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001701{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001702 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001703 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001704 Py_ssize_t startinpos;
1705 Py_ssize_t endinpos;
1706 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001707 const char *e;
1708 PyUnicodeObject *unicode;
1709 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001710 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001711 PyObject *errorHandler = NULL;
1712 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001713
1714 /* Note: size will always be longer than the resulting Unicode
1715 character count */
1716 unicode = _PyUnicode_New(size);
1717 if (!unicode)
1718 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001719 if (size == 0) {
1720 if (consumed)
1721 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001722 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001723 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001724
1725 /* Unpack UTF-8 encoded data */
1726 p = unicode->str;
1727 e = s + size;
1728
1729 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001730 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001731
1732 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001733 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734 s++;
1735 continue;
1736 }
1737
1738 n = utf8_code_length[ch];
1739
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001740 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001741 if (consumed)
1742 break;
1743 else {
1744 errmsg = "unexpected end of data";
1745 startinpos = s-starts;
1746 endinpos = size;
1747 goto utf8Error;
1748 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001749 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001750
1751 switch (n) {
1752
1753 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001754 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001755 startinpos = s-starts;
1756 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001757 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001758
1759 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001760 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001761 startinpos = s-starts;
1762 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001763 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001764
1765 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001766 if ((s[1] & 0xc0) != 0x80) {
1767 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001768 startinpos = s-starts;
1769 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001770 goto utf8Error;
1771 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001772 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001773 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001774 startinpos = s-starts;
1775 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001776 errmsg = "illegal encoding";
1777 goto utf8Error;
1778 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001779 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001780 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001781 break;
1782
1783 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001784 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001785 (s[2] & 0xc0) != 0x80) {
1786 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001787 startinpos = s-starts;
1788 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001789 goto utf8Error;
1790 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001791 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001792 if (ch < 0x0800) {
1793 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001794 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001795
1796 XXX For wide builds (UCS-4) we should probably try
1797 to recombine the surrogates into a single code
1798 unit.
1799 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001800 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001801 startinpos = s-starts;
1802 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001803 goto utf8Error;
1804 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001805 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001806 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001807 break;
1808
1809 case 4:
1810 if ((s[1] & 0xc0) != 0x80 ||
1811 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001812 (s[3] & 0xc0) != 0x80) {
1813 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001814 startinpos = s-starts;
1815 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001816 goto utf8Error;
1817 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001818 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1819 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1820 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001821 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001822 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001823 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001824 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001825 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001826 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001827 startinpos = s-starts;
1828 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001829 goto utf8Error;
1830 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001831#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001832 *p++ = (Py_UNICODE)ch;
1833#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001834 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001835
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001836 /* translate from 10000..10FFFF to 0..FFFF */
1837 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001838
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001839 /* high surrogate = top 10 bits added to D800 */
1840 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001841
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001842 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001843 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001844#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001845 break;
1846
1847 default:
1848 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001849 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001850 startinpos = s-starts;
1851 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001852 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001853 }
1854 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001855 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001856
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001857 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001858 outpos = p-PyUnicode_AS_UNICODE(unicode);
1859 if (unicode_decode_call_errorhandler(
1860 errors, &errorHandler,
1861 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001862 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001863 (PyObject **)&unicode, &outpos, &p))
1864 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001865 }
Walter Dörwald69652032004-09-07 20:24:22 +00001866 if (consumed)
1867 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001868
1869 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001870 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001871 goto onError;
1872
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001873 Py_XDECREF(errorHandler);
1874 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001875 return (PyObject *)unicode;
1876
1877onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001878 Py_XDECREF(errorHandler);
1879 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001880 Py_DECREF(unicode);
1881 return NULL;
1882}
1883
Tim Peters602f7402002-04-27 18:03:26 +00001884/* Allocation strategy: if the string is short, convert into a stack buffer
1885 and allocate exactly as much space needed at the end. Else allocate the
1886 maximum possible needed (4 result bytes per Unicode character), and return
1887 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001888*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001889PyObject *
1890PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001891 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001892 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001893{
Tim Peters602f7402002-04-27 18:03:26 +00001894#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001895
Martin v. Löwis18e16552006-02-15 17:27:45 +00001896 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001897 PyObject *v; /* result string object */
1898 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001899 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001900 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001901 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001902
Tim Peters602f7402002-04-27 18:03:26 +00001903 assert(s != NULL);
1904 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001905
Tim Peters602f7402002-04-27 18:03:26 +00001906 if (size <= MAX_SHORT_UNICHARS) {
1907 /* Write into the stack buffer; nallocated can't overflow.
1908 * At the end, we'll allocate exactly as much heap space as it
1909 * turns out we need.
1910 */
1911 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1912 v = NULL; /* will allocate after we're done */
1913 p = stackbuf;
1914 }
1915 else {
1916 /* Overallocate on the heap, and give the excess back at the end. */
1917 nallocated = size * 4;
1918 if (nallocated / 4 != size) /* overflow! */
1919 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001920 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001921 if (v == NULL)
1922 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001923 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001924 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001925
Tim Peters602f7402002-04-27 18:03:26 +00001926 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001927 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001928
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001929 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001930 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001931 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001932
Guido van Rossumd57fd912000-03-10 22:53:23 +00001933 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001934 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001935 *p++ = (char)(0xc0 | (ch >> 6));
1936 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001937 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001938 else {
Tim Peters602f7402002-04-27 18:03:26 +00001939 /* Encode UCS2 Unicode ordinals */
1940 if (ch < 0x10000) {
1941 /* Special case: check for high surrogate */
1942 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1943 Py_UCS4 ch2 = s[i];
1944 /* Check for low surrogate and combine the two to
1945 form a UCS4 value */
1946 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001947 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001948 i++;
1949 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001950 }
Tim Peters602f7402002-04-27 18:03:26 +00001951 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001952 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001953 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001954 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1955 *p++ = (char)(0x80 | (ch & 0x3f));
1956 continue;
1957 }
1958encodeUCS4:
1959 /* Encode UCS4 Unicode ordinals */
1960 *p++ = (char)(0xf0 | (ch >> 18));
1961 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1962 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1963 *p++ = (char)(0x80 | (ch & 0x3f));
1964 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001965 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001966
Tim Peters602f7402002-04-27 18:03:26 +00001967 if (v == NULL) {
1968 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001969 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001970 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001971 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001972 }
1973 else {
1974 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001975 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001976 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001977 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001978 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001979 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001980
Tim Peters602f7402002-04-27 18:03:26 +00001981#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001982}
1983
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1985{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001986 if (!PyUnicode_Check(unicode)) {
1987 PyErr_BadArgument();
1988 return NULL;
1989 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001990 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1991 PyUnicode_GET_SIZE(unicode),
1992 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001993}
1994
/* --- UTF-16 Codec ------------------------------------------------------- */
1996
Tim Peters772747b2001-08-09 22:21:55 +00001997PyObject *
1998PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001999 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002000 const char *errors,
2001 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002002{
Walter Dörwald69652032004-09-07 20:24:22 +00002003 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2004}
2005
2006PyObject *
2007PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002008 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002009 const char *errors,
2010 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002011 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002012{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002013 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002014 Py_ssize_t startinpos;
2015 Py_ssize_t endinpos;
2016 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002017 PyUnicodeObject *unicode;
2018 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002019 const unsigned char *q, *e;
2020 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002021 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002022 /* Offsets from q for retrieving byte pairs in the right order. */
2023#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2024 int ihi = 1, ilo = 0;
2025#else
2026 int ihi = 0, ilo = 1;
2027#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002028 PyObject *errorHandler = NULL;
2029 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002030
2031 /* Note: size will always be longer than the resulting Unicode
2032 character count */
2033 unicode = _PyUnicode_New(size);
2034 if (!unicode)
2035 return NULL;
2036 if (size == 0)
2037 return (PyObject *)unicode;
2038
2039 /* Unpack UTF-16 encoded data */
2040 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002041 q = (unsigned char *)s;
2042 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002043
2044 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002045 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002046
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002047 /* Check for BOM marks (U+FEFF) in the input and adjust current
2048 byte order setting accordingly. In native mode, the leading BOM
2049 mark is skipped, in all other modes, it is copied to the output
2050 stream as-is (giving a ZWNBSP character). */
2051 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002052 if (size >= 2) {
2053 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002054#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002055 if (bom == 0xFEFF) {
2056 q += 2;
2057 bo = -1;
2058 }
2059 else if (bom == 0xFFFE) {
2060 q += 2;
2061 bo = 1;
2062 }
Tim Petersced69f82003-09-16 20:30:58 +00002063#else
Walter Dörwald69652032004-09-07 20:24:22 +00002064 if (bom == 0xFEFF) {
2065 q += 2;
2066 bo = 1;
2067 }
2068 else if (bom == 0xFFFE) {
2069 q += 2;
2070 bo = -1;
2071 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002072#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002073 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002074 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002075
Tim Peters772747b2001-08-09 22:21:55 +00002076 if (bo == -1) {
2077 /* force LE */
2078 ihi = 1;
2079 ilo = 0;
2080 }
2081 else if (bo == 1) {
2082 /* force BE */
2083 ihi = 0;
2084 ilo = 1;
2085 }
2086
2087 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002088 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002089 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002090 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002091 if (consumed)
2092 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002093 errmsg = "truncated data";
2094 startinpos = ((const char *)q)-starts;
2095 endinpos = ((const char *)e)-starts;
2096 goto utf16Error;
2097 /* The remaining input chars are ignored if the callback
2098 chooses to skip the input */
2099 }
2100 ch = (q[ihi] << 8) | q[ilo];
2101
Tim Peters772747b2001-08-09 22:21:55 +00002102 q += 2;
2103
Guido van Rossumd57fd912000-03-10 22:53:23 +00002104 if (ch < 0xD800 || ch > 0xDFFF) {
2105 *p++ = ch;
2106 continue;
2107 }
2108
2109 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002110 if (q >= e) {
2111 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002112 startinpos = (((const char *)q)-2)-starts;
2113 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002114 goto utf16Error;
2115 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002116 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002117 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2118 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002119 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002120#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002121 *p++ = ch;
2122 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002123#else
2124 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002125#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002126 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002127 }
2128 else {
2129 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002130 startinpos = (((const char *)q)-4)-starts;
2131 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002132 goto utf16Error;
2133 }
2134
Guido van Rossumd57fd912000-03-10 22:53:23 +00002135 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002136 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002137 startinpos = (((const char *)q)-2)-starts;
2138 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002139 /* Fall through to report the error */
2140
2141 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002142 outpos = p-PyUnicode_AS_UNICODE(unicode);
2143 if (unicode_decode_call_errorhandler(
2144 errors, &errorHandler,
2145 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002146 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002147 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002148 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002149 }
2150
2151 if (byteorder)
2152 *byteorder = bo;
2153
Walter Dörwald69652032004-09-07 20:24:22 +00002154 if (consumed)
2155 *consumed = (const char *)q-starts;
2156
Guido van Rossumd57fd912000-03-10 22:53:23 +00002157 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002158 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002159 goto onError;
2160
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002161 Py_XDECREF(errorHandler);
2162 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002163 return (PyObject *)unicode;
2164
2165onError:
2166 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002167 Py_XDECREF(errorHandler);
2168 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002169 return NULL;
2170}
2171
Tim Peters772747b2001-08-09 22:21:55 +00002172PyObject *
2173PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002174 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002175 const char *errors,
2176 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002177{
2178 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002179 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002180#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002181 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002182#else
2183 const int pairs = 0;
2184#endif
Tim Peters772747b2001-08-09 22:21:55 +00002185 /* Offsets from p for storing byte pairs in the right order. */
2186#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2187 int ihi = 1, ilo = 0;
2188#else
2189 int ihi = 0, ilo = 1;
2190#endif
2191
2192#define STORECHAR(CH) \
2193 do { \
2194 p[ihi] = ((CH) >> 8) & 0xff; \
2195 p[ilo] = (CH) & 0xff; \
2196 p += 2; \
2197 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002198
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002199#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002200 for (i = pairs = 0; i < size; i++)
2201 if (s[i] >= 0x10000)
2202 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002203#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002204 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002205 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002206 if (v == NULL)
2207 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002208
Walter Dörwald3cc34522007-05-04 10:48:27 +00002209 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002210 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002211 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002212 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002213 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002214
2215 if (byteorder == -1) {
2216 /* force LE */
2217 ihi = 1;
2218 ilo = 0;
2219 }
2220 else if (byteorder == 1) {
2221 /* force BE */
2222 ihi = 0;
2223 ilo = 1;
2224 }
2225
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002226 while (size-- > 0) {
2227 Py_UNICODE ch = *s++;
2228 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002229#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002230 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002231 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2232 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002233 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002234#endif
Tim Peters772747b2001-08-09 22:21:55 +00002235 STORECHAR(ch);
2236 if (ch2)
2237 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002238 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002239 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002240#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002241}
2242
2243PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2244{
2245 if (!PyUnicode_Check(unicode)) {
2246 PyErr_BadArgument();
2247 return NULL;
2248 }
2249 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2250 PyUnicode_GET_SIZE(unicode),
2251 NULL,
2252 0);
2253}
2254
2255/* --- Unicode Escape Codec ----------------------------------------------- */
2256
Fredrik Lundh06d12682001-01-24 07:59:11 +00002257static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002258
Guido van Rossumd57fd912000-03-10 22:53:23 +00002259PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002260 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261 const char *errors)
2262{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002263 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002264 Py_ssize_t startinpos;
2265 Py_ssize_t endinpos;
2266 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002267 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002268 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002269 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002270 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002271 char* message;
2272 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002273 PyObject *errorHandler = NULL;
2274 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002275
Guido van Rossumd57fd912000-03-10 22:53:23 +00002276 /* Escaped strings will always be longer than the resulting
2277 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002278 length after conversion to the true value.
2279 (but if the error callback returns a long replacement string
2280 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002281 v = _PyUnicode_New(size);
2282 if (v == NULL)
2283 goto onError;
2284 if (size == 0)
2285 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002286
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002287 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002288 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002289
Guido van Rossumd57fd912000-03-10 22:53:23 +00002290 while (s < end) {
2291 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002292 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002293 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002294
2295 /* Non-escape characters are interpreted as Unicode ordinals */
2296 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002297 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298 continue;
2299 }
2300
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002301 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002302 /* \ - Escapes */
2303 s++;
2304 switch (*s++) {
2305
2306 /* \x escapes */
2307 case '\n': break;
2308 case '\\': *p++ = '\\'; break;
2309 case '\'': *p++ = '\''; break;
2310 case '\"': *p++ = '\"'; break;
2311 case 'b': *p++ = '\b'; break;
2312 case 'f': *p++ = '\014'; break; /* FF */
2313 case 't': *p++ = '\t'; break;
2314 case 'n': *p++ = '\n'; break;
2315 case 'r': *p++ = '\r'; break;
2316 case 'v': *p++ = '\013'; break; /* VT */
2317 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2318
2319 /* \OOO (octal) escapes */
2320 case '0': case '1': case '2': case '3':
2321 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002322 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002323 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002324 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002325 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002326 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002327 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002328 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002329 break;
2330
Fredrik Lundhccc74732001-02-18 22:13:49 +00002331 /* hex escapes */
2332 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002333 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002334 digits = 2;
2335 message = "truncated \\xXX escape";
2336 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002337
Fredrik Lundhccc74732001-02-18 22:13:49 +00002338 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002339 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002340 digits = 4;
2341 message = "truncated \\uXXXX escape";
2342 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002343
Fredrik Lundhccc74732001-02-18 22:13:49 +00002344 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002345 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002346 digits = 8;
2347 message = "truncated \\UXXXXXXXX escape";
2348 hexescape:
2349 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002350 outpos = p-PyUnicode_AS_UNICODE(v);
2351 if (s+digits>end) {
2352 endinpos = size;
2353 if (unicode_decode_call_errorhandler(
2354 errors, &errorHandler,
2355 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002356 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002357 (PyObject **)&v, &outpos, &p))
2358 goto onError;
2359 goto nextByte;
2360 }
2361 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002362 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002363 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002364 endinpos = (s+i+1)-starts;
2365 if (unicode_decode_call_errorhandler(
2366 errors, &errorHandler,
2367 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002368 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002369 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002370 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002371 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002372 }
2373 chr = (chr<<4) & ~0xF;
2374 if (c >= '0' && c <= '9')
2375 chr += c - '0';
2376 else if (c >= 'a' && c <= 'f')
2377 chr += 10 + c - 'a';
2378 else
2379 chr += 10 + c - 'A';
2380 }
2381 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002382 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002383 /* _decoding_error will have already written into the
2384 target buffer. */
2385 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002386 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002387 /* when we get here, chr is a 32-bit unicode character */
2388 if (chr <= 0xffff)
2389 /* UCS-2 character */
2390 *p++ = (Py_UNICODE) chr;
2391 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002392 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002393 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002394#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002395 *p++ = chr;
2396#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002397 chr -= 0x10000L;
2398 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002399 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002400#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002401 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002402 endinpos = s-starts;
2403 outpos = p-PyUnicode_AS_UNICODE(v);
2404 if (unicode_decode_call_errorhandler(
2405 errors, &errorHandler,
2406 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002407 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002408 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002409 goto onError;
2410 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002411 break;
2412
2413 /* \N{name} */
2414 case 'N':
2415 message = "malformed \\N character escape";
2416 if (ucnhash_CAPI == NULL) {
2417 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002418 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002419 m = PyImport_ImportModule("unicodedata");
2420 if (m == NULL)
2421 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002422 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002423 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002424 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002425 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002426 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002427 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002428 if (ucnhash_CAPI == NULL)
2429 goto ucnhashError;
2430 }
2431 if (*s == '{') {
2432 const char *start = s+1;
2433 /* look for the closing brace */
2434 while (*s != '}' && s < end)
2435 s++;
2436 if (s > start && s < end && *s == '}') {
2437 /* found a name. look it up in the unicode database */
2438 message = "unknown Unicode character name";
2439 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002440 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002441 goto store;
2442 }
2443 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002444 endinpos = s-starts;
2445 outpos = p-PyUnicode_AS_UNICODE(v);
2446 if (unicode_decode_call_errorhandler(
2447 errors, &errorHandler,
2448 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002449 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002450 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002451 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002452 break;
2453
2454 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002455 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002456 message = "\\ at end of string";
2457 s--;
2458 endinpos = s-starts;
2459 outpos = p-PyUnicode_AS_UNICODE(v);
2460 if (unicode_decode_call_errorhandler(
2461 errors, &errorHandler,
2462 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002463 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002464 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002465 goto onError;
2466 }
2467 else {
2468 *p++ = '\\';
2469 *p++ = (unsigned char)s[-1];
2470 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002471 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002472 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002473 nextByte:
2474 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002475 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002476 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002477 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002478 Py_XDECREF(errorHandler);
2479 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002480 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002481
Fredrik Lundhccc74732001-02-18 22:13:49 +00002482ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002483 PyErr_SetString(
2484 PyExc_UnicodeError,
2485 "\\N escapes not supported (can't load unicodedata module)"
2486 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002487 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002488 Py_XDECREF(errorHandler);
2489 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002490 return NULL;
2491
Fredrik Lundhccc74732001-02-18 22:13:49 +00002492onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002493 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002494 Py_XDECREF(errorHandler);
2495 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002496 return NULL;
2497}
2498
2499/* Return a Unicode-Escape string version of the Unicode object.
2500
2501 If quotes is true, the string is enclosed in u"" or u'' quotes as
2502 appropriate.
2503
2504*/
2505
Thomas Wouters477c8d52006-05-27 19:21:47 +00002506Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2507 Py_ssize_t size,
2508 Py_UNICODE ch)
2509{
2510 /* like wcschr, but doesn't stop at NULL characters */
2511
2512 while (size-- > 0) {
2513 if (*s == ch)
2514 return s;
2515 s++;
2516 }
2517
2518 return NULL;
2519}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002520
Walter Dörwald79e913e2007-05-12 11:08:06 +00002521static const char *hexdigits = "0123456789abcdef";
2522
2523PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2524 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002525{
2526 PyObject *repr;
2527 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002528
Thomas Wouters89f507f2006-12-13 04:49:30 +00002529 /* XXX(nnorwitz): rather than over-allocating, it would be
2530 better to choose a different scheme. Perhaps scan the
2531 first N-chars of the string and allocate based on that size.
2532 */
2533 /* Initial allocation is based on the longest-possible unichr
2534 escape.
2535
2536 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2537 unichr, so in this case it's the longest unichr escape. In
2538 narrow (UTF-16) builds this is five chars per source unichr
2539 since there are two unichrs in the surrogate pair, so in narrow
2540 (UTF-16) builds it's not the longest unichr escape.
2541
2542 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2543 so in the narrow (UTF-16) build case it's the longest unichr
2544 escape.
2545 */
2546
Walter Dörwald79e913e2007-05-12 11:08:06 +00002547 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002548#ifdef Py_UNICODE_WIDE
2549 + 10*size
2550#else
2551 + 6*size
2552#endif
2553 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002554 if (repr == NULL)
2555 return NULL;
2556
Walter Dörwald79e913e2007-05-12 11:08:06 +00002557 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002558
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559 while (size-- > 0) {
2560 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002561
Walter Dörwald79e913e2007-05-12 11:08:06 +00002562 /* Escape backslashes */
2563 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002564 *p++ = '\\';
2565 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002566 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002567 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002568
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002569#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002570 /* Map 21-bit characters to '\U00xxxxxx' */
2571 else if (ch >= 0x10000) {
2572 *p++ = '\\';
2573 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002574 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2575 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2576 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2577 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2578 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2579 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2580 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2581 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002582 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002583 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002584#else
2585 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002586 else if (ch >= 0xD800 && ch < 0xDC00) {
2587 Py_UNICODE ch2;
2588 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002589
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002590 ch2 = *s++;
2591 size--;
2592 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2593 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2594 *p++ = '\\';
2595 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002596 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2597 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2598 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2599 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2600 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2601 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2602 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2603 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002604 continue;
2605 }
2606 /* Fall through: isolated surrogates are copied as-is */
2607 s--;
2608 size++;
2609 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002610#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002611
Guido van Rossumd57fd912000-03-10 22:53:23 +00002612 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002613 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002614 *p++ = '\\';
2615 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002616 *p++ = hexdigits[(ch >> 12) & 0x000F];
2617 *p++ = hexdigits[(ch >> 8) & 0x000F];
2618 *p++ = hexdigits[(ch >> 4) & 0x000F];
2619 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002620 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002621
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002622 /* Map special whitespace to '\t', \n', '\r' */
2623 else if (ch == '\t') {
2624 *p++ = '\\';
2625 *p++ = 't';
2626 }
2627 else if (ch == '\n') {
2628 *p++ = '\\';
2629 *p++ = 'n';
2630 }
2631 else if (ch == '\r') {
2632 *p++ = '\\';
2633 *p++ = 'r';
2634 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002635
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002636 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002637 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002638 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002639 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002640 *p++ = hexdigits[(ch >> 4) & 0x000F];
2641 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002642 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002643
Guido van Rossumd57fd912000-03-10 22:53:23 +00002644 /* Copy everything else as-is */
2645 else
2646 *p++ = (char) ch;
2647 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002648
2649 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002650 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2651 Py_DECREF(repr);
2652 return NULL;
2653 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002654 return repr;
2655}
2656
Guido van Rossumd57fd912000-03-10 22:53:23 +00002657PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2658{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002659 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002660 if (!PyUnicode_Check(unicode)) {
2661 PyErr_BadArgument();
2662 return NULL;
2663 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002664 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2665 PyUnicode_GET_SIZE(unicode));
2666
2667 if (!s)
2668 return NULL;
2669 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2670 PyBytes_GET_SIZE(s));
2671 Py_DECREF(s);
2672 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002673}
2674
2675/* --- Raw Unicode Escape Codec ------------------------------------------- */
2676
2677PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002678 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002679 const char *errors)
2680{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002681 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002682 Py_ssize_t startinpos;
2683 Py_ssize_t endinpos;
2684 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002685 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002686 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002687 const char *end;
2688 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002689 PyObject *errorHandler = NULL;
2690 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002691
Guido van Rossumd57fd912000-03-10 22:53:23 +00002692 /* Escaped strings will always be longer than the resulting
2693 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002694 length after conversion to the true value. (But decoding error
2695 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002696 v = _PyUnicode_New(size);
2697 if (v == NULL)
2698 goto onError;
2699 if (size == 0)
2700 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002701 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002702 end = s + size;
2703 while (s < end) {
2704 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002705 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002707 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002708
2709 /* Non-escape characters are interpreted as Unicode ordinals */
2710 if (*s != '\\') {
2711 *p++ = (unsigned char)*s++;
2712 continue;
2713 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002714 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002715
2716 /* \u-escapes are only interpreted iff the number of leading
2717 backslashes if odd */
2718 bs = s;
2719 for (;s < end;) {
2720 if (*s != '\\')
2721 break;
2722 *p++ = (unsigned char)*s++;
2723 }
2724 if (((s - bs) & 1) == 0 ||
2725 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002726 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727 continue;
2728 }
2729 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002730 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002731 s++;
2732
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002733 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002734 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002735 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002736 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002737 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002738 endinpos = s-starts;
2739 if (unicode_decode_call_errorhandler(
2740 errors, &errorHandler,
2741 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002742 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002743 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002744 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002745 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002746 }
2747 x = (x<<4) & ~0xF;
2748 if (c >= '0' && c <= '9')
2749 x += c - '0';
2750 else if (c >= 'a' && c <= 'f')
2751 x += 10 + c - 'a';
2752 else
2753 x += 10 + c - 'A';
2754 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002755#ifndef Py_UNICODE_WIDE
2756 if (x > 0x10000) {
2757 if (unicode_decode_call_errorhandler(
2758 errors, &errorHandler,
2759 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002760 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002761 (PyObject **)&v, &outpos, &p))
2762 goto onError;
2763 }
2764#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002765 *p++ = x;
2766 nextByte:
2767 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002768 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002769 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002770 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002771 Py_XDECREF(errorHandler);
2772 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002773 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002774
Guido van Rossumd57fd912000-03-10 22:53:23 +00002775 onError:
2776 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002777 Py_XDECREF(errorHandler);
2778 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002779 return NULL;
2780}
2781
2782PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002783 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784{
2785 PyObject *repr;
2786 char *p;
2787 char *q;
2788
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002789#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00002790 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002791#else
Walter Dörwald711005d2007-05-12 12:03:26 +00002792 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002793#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002794 if (repr == NULL)
2795 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002796 if (size == 0)
2797 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798
Walter Dörwald711005d2007-05-12 12:03:26 +00002799 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002800 while (size-- > 0) {
2801 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002802#ifdef Py_UNICODE_WIDE
2803 /* Map 32-bit characters to '\Uxxxxxxxx' */
2804 if (ch >= 0x10000) {
2805 *p++ = '\\';
2806 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002807 *p++ = hexdigits[(ch >> 28) & 0xf];
2808 *p++ = hexdigits[(ch >> 24) & 0xf];
2809 *p++ = hexdigits[(ch >> 20) & 0xf];
2810 *p++ = hexdigits[(ch >> 16) & 0xf];
2811 *p++ = hexdigits[(ch >> 12) & 0xf];
2812 *p++ = hexdigits[(ch >> 8) & 0xf];
2813 *p++ = hexdigits[(ch >> 4) & 0xf];
2814 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002815 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002816 else
2817#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002818 /* Map 16-bit characters to '\uxxxx' */
2819 if (ch >= 256) {
2820 *p++ = '\\';
2821 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002822 *p++ = hexdigits[(ch >> 12) & 0xf];
2823 *p++ = hexdigits[(ch >> 8) & 0xf];
2824 *p++ = hexdigits[(ch >> 4) & 0xf];
2825 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826 }
2827 /* Copy everything else as-is */
2828 else
2829 *p++ = (char) ch;
2830 }
2831 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00002832 if (PyBytes_Resize(repr, p - q)) {
2833 Py_DECREF(repr);
2834 return NULL;
2835 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836 return repr;
2837}
2838
2839PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2840{
Walter Dörwald711005d2007-05-12 12:03:26 +00002841 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002842 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00002843 PyErr_BadArgument();
2844 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845 }
Walter Dörwald711005d2007-05-12 12:03:26 +00002846 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2847 PyUnicode_GET_SIZE(unicode));
2848
2849 if (!s)
2850 return NULL;
2851 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2852 PyBytes_GET_SIZE(s));
2853 Py_DECREF(s);
2854 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002855}
2856
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002857/* --- Unicode Internal Codec ------------------------------------------- */
2858
2859PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002860 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002861 const char *errors)
2862{
2863 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002864 Py_ssize_t startinpos;
2865 Py_ssize_t endinpos;
2866 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002867 PyUnicodeObject *v;
2868 Py_UNICODE *p;
2869 const char *end;
2870 const char *reason;
2871 PyObject *errorHandler = NULL;
2872 PyObject *exc = NULL;
2873
Neal Norwitzd43069c2006-01-08 01:12:10 +00002874#ifdef Py_UNICODE_WIDE
2875 Py_UNICODE unimax = PyUnicode_GetMax();
2876#endif
2877
Thomas Wouters89f507f2006-12-13 04:49:30 +00002878 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002879 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2880 if (v == NULL)
2881 goto onError;
2882 if (PyUnicode_GetSize((PyObject *)v) == 0)
2883 return (PyObject *)v;
2884 p = PyUnicode_AS_UNICODE(v);
2885 end = s + size;
2886
2887 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002888 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002889 /* We have to sanity check the raw data, otherwise doom looms for
2890 some malformed UCS-4 data. */
2891 if (
2892 #ifdef Py_UNICODE_WIDE
2893 *p > unimax || *p < 0 ||
2894 #endif
2895 end-s < Py_UNICODE_SIZE
2896 )
2897 {
2898 startinpos = s - starts;
2899 if (end-s < Py_UNICODE_SIZE) {
2900 endinpos = end-starts;
2901 reason = "truncated input";
2902 }
2903 else {
2904 endinpos = s - starts + Py_UNICODE_SIZE;
2905 reason = "illegal code point (> 0x10FFFF)";
2906 }
2907 outpos = p - PyUnicode_AS_UNICODE(v);
2908 if (unicode_decode_call_errorhandler(
2909 errors, &errorHandler,
2910 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002911 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002912 (PyObject **)&v, &outpos, &p)) {
2913 goto onError;
2914 }
2915 }
2916 else {
2917 p++;
2918 s += Py_UNICODE_SIZE;
2919 }
2920 }
2921
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002922 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002923 goto onError;
2924 Py_XDECREF(errorHandler);
2925 Py_XDECREF(exc);
2926 return (PyObject *)v;
2927
2928 onError:
2929 Py_XDECREF(v);
2930 Py_XDECREF(errorHandler);
2931 Py_XDECREF(exc);
2932 return NULL;
2933}
2934
Guido van Rossumd57fd912000-03-10 22:53:23 +00002935/* --- Latin-1 Codec ------------------------------------------------------ */
2936
2937PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002938 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002939 const char *errors)
2940{
2941 PyUnicodeObject *v;
2942 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002943
Guido van Rossumd57fd912000-03-10 22:53:23 +00002944 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002945 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002946 Py_UNICODE r = *(unsigned char*)s;
2947 return PyUnicode_FromUnicode(&r, 1);
2948 }
2949
Guido van Rossumd57fd912000-03-10 22:53:23 +00002950 v = _PyUnicode_New(size);
2951 if (v == NULL)
2952 goto onError;
2953 if (size == 0)
2954 return (PyObject *)v;
2955 p = PyUnicode_AS_UNICODE(v);
2956 while (size-- > 0)
2957 *p++ = (unsigned char)*s++;
2958 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002959
Guido van Rossumd57fd912000-03-10 22:53:23 +00002960 onError:
2961 Py_XDECREF(v);
2962 return NULL;
2963}
2964
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002965/* create or adjust a UnicodeEncodeError */
2966static void make_encode_exception(PyObject **exceptionObject,
2967 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002968 const Py_UNICODE *unicode, Py_ssize_t size,
2969 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002970 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002971{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002972 if (*exceptionObject == NULL) {
2973 *exceptionObject = PyUnicodeEncodeError_Create(
2974 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002975 }
2976 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002977 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2978 goto onError;
2979 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2980 goto onError;
2981 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2982 goto onError;
2983 return;
2984 onError:
2985 Py_DECREF(*exceptionObject);
2986 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987 }
2988}
2989
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002990/* raises a UnicodeEncodeError */
2991static void raise_encode_exception(PyObject **exceptionObject,
2992 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002993 const Py_UNICODE *unicode, Py_ssize_t size,
2994 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002995 const char *reason)
2996{
2997 make_encode_exception(exceptionObject,
2998 encoding, unicode, size, startpos, endpos, reason);
2999 if (*exceptionObject != NULL)
3000 PyCodec_StrictErrors(*exceptionObject);
3001}
3002
3003/* error handling callback helper:
3004 build arguments, call the callback and check the arguments,
3005 put the result into newpos and return the replacement string, which
3006 has to be freed by the caller */
3007static PyObject *unicode_encode_call_errorhandler(const char *errors,
3008 PyObject **errorHandler,
3009 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003010 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3011 Py_ssize_t startpos, Py_ssize_t endpos,
3012 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003013{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003014 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003015
3016 PyObject *restuple;
3017 PyObject *resunicode;
3018
3019 if (*errorHandler == NULL) {
3020 *errorHandler = PyCodec_LookupError(errors);
3021 if (*errorHandler == NULL)
3022 return NULL;
3023 }
3024
3025 make_encode_exception(exceptionObject,
3026 encoding, unicode, size, startpos, endpos, reason);
3027 if (*exceptionObject == NULL)
3028 return NULL;
3029
3030 restuple = PyObject_CallFunctionObjArgs(
3031 *errorHandler, *exceptionObject, NULL);
3032 if (restuple == NULL)
3033 return NULL;
3034 if (!PyTuple_Check(restuple)) {
3035 PyErr_Format(PyExc_TypeError, &argparse[4]);
3036 Py_DECREF(restuple);
3037 return NULL;
3038 }
3039 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3040 &resunicode, newpos)) {
3041 Py_DECREF(restuple);
3042 return NULL;
3043 }
3044 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003045 *newpos = size+*newpos;
3046 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003047 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003048 Py_DECREF(restuple);
3049 return NULL;
3050 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003051 Py_INCREF(resunicode);
3052 Py_DECREF(restuple);
3053 return resunicode;
3054}
3055
3056static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003057 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003058 const char *errors,
3059 int limit)
3060{
3061 /* output object */
3062 PyObject *res;
3063 /* pointers to the beginning and end+1 of input */
3064 const Py_UNICODE *startp = p;
3065 const Py_UNICODE *endp = p + size;
3066 /* pointer to the beginning of the unencodable characters */
3067 /* const Py_UNICODE *badp = NULL; */
3068 /* pointer into the output */
3069 char *str;
3070 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003071 Py_ssize_t respos = 0;
3072 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003073 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3074 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003075 PyObject *errorHandler = NULL;
3076 PyObject *exc = NULL;
3077 /* the following variable is used for caching string comparisons
3078 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3079 int known_errorHandler = -1;
3080
3081 /* allocate enough for a simple encoding without
3082 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003083 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003084 if (res == NULL)
3085 goto onError;
3086 if (size == 0)
3087 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003088 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003089 ressize = size;
3090
3091 while (p<endp) {
3092 Py_UNICODE c = *p;
3093
3094 /* can we encode this? */
3095 if (c<limit) {
3096 /* no overflow check, because we know that the space is enough */
3097 *str++ = (char)c;
3098 ++p;
3099 }
3100 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003101 Py_ssize_t unicodepos = p-startp;
3102 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003103 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003104 Py_ssize_t repsize;
3105 Py_ssize_t newpos;
3106 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003107 Py_UNICODE *uni2;
3108 /* startpos for collecting unencodable chars */
3109 const Py_UNICODE *collstart = p;
3110 const Py_UNICODE *collend = p;
3111 /* find all unecodable characters */
3112 while ((collend < endp) && ((*collend)>=limit))
3113 ++collend;
3114 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3115 if (known_errorHandler==-1) {
3116 if ((errors==NULL) || (!strcmp(errors, "strict")))
3117 known_errorHandler = 1;
3118 else if (!strcmp(errors, "replace"))
3119 known_errorHandler = 2;
3120 else if (!strcmp(errors, "ignore"))
3121 known_errorHandler = 3;
3122 else if (!strcmp(errors, "xmlcharrefreplace"))
3123 known_errorHandler = 4;
3124 else
3125 known_errorHandler = 0;
3126 }
3127 switch (known_errorHandler) {
3128 case 1: /* strict */
3129 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3130 goto onError;
3131 case 2: /* replace */
3132 while (collstart++<collend)
3133 *str++ = '?'; /* fall through */
3134 case 3: /* ignore */
3135 p = collend;
3136 break;
3137 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003138 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003139 /* determine replacement size (temporarily (mis)uses p) */
3140 for (p = collstart, repsize = 0; p < collend; ++p) {
3141 if (*p<10)
3142 repsize += 2+1+1;
3143 else if (*p<100)
3144 repsize += 2+2+1;
3145 else if (*p<1000)
3146 repsize += 2+3+1;
3147 else if (*p<10000)
3148 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003149#ifndef Py_UNICODE_WIDE
3150 else
3151 repsize += 2+5+1;
3152#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003153 else if (*p<100000)
3154 repsize += 2+5+1;
3155 else if (*p<1000000)
3156 repsize += 2+6+1;
3157 else
3158 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003159#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003160 }
3161 requiredsize = respos+repsize+(endp-collend);
3162 if (requiredsize > ressize) {
3163 if (requiredsize<2*ressize)
3164 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003165 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003166 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003167 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003168 ressize = requiredsize;
3169 }
3170 /* generate replacement (temporarily (mis)uses p) */
3171 for (p = collstart; p < collend; ++p) {
3172 str += sprintf(str, "&#%d;", (int)*p);
3173 }
3174 p = collend;
3175 break;
3176 default:
3177 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3178 encoding, reason, startp, size, &exc,
3179 collstart-startp, collend-startp, &newpos);
3180 if (repunicode == NULL)
3181 goto onError;
3182 /* need more space? (at least enough for what we
3183 have+the replacement+the rest of the string, so
3184 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003185 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003186 repsize = PyUnicode_GET_SIZE(repunicode);
3187 requiredsize = respos+repsize+(endp-collend);
3188 if (requiredsize > ressize) {
3189 if (requiredsize<2*ressize)
3190 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003191 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003192 Py_DECREF(repunicode);
3193 goto onError;
3194 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003195 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003196 ressize = requiredsize;
3197 }
3198 /* check if there is anything unencodable in the replacement
3199 and copy it to the output */
3200 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3201 c = *uni2;
3202 if (c >= limit) {
3203 raise_encode_exception(&exc, encoding, startp, size,
3204 unicodepos, unicodepos+1, reason);
3205 Py_DECREF(repunicode);
3206 goto onError;
3207 }
3208 *str = (char)c;
3209 }
3210 p = startp + newpos;
3211 Py_DECREF(repunicode);
3212 }
3213 }
3214 }
3215 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003216 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003217 if (respos<ressize)
3218 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003219 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003220 Py_XDECREF(errorHandler);
3221 Py_XDECREF(exc);
3222 return res;
3223
3224 onError:
3225 Py_XDECREF(res);
3226 Py_XDECREF(errorHandler);
3227 Py_XDECREF(exc);
3228 return NULL;
3229}
3230
Guido van Rossumd57fd912000-03-10 22:53:23 +00003231PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003232 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003233 const char *errors)
3234{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003235 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003236}
3237
3238PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3239{
3240 if (!PyUnicode_Check(unicode)) {
3241 PyErr_BadArgument();
3242 return NULL;
3243 }
3244 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3245 PyUnicode_GET_SIZE(unicode),
3246 NULL);
3247}
3248
3249/* --- 7-bit ASCII Codec -------------------------------------------------- */
3250
Guido van Rossumd57fd912000-03-10 22:53:23 +00003251PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003252 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003253 const char *errors)
3254{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003255 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256 PyUnicodeObject *v;
3257 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003258 Py_ssize_t startinpos;
3259 Py_ssize_t endinpos;
3260 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003261 const char *e;
3262 PyObject *errorHandler = NULL;
3263 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003264
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003266 if (size == 1 && *(unsigned char*)s < 128) {
3267 Py_UNICODE r = *(unsigned char*)s;
3268 return PyUnicode_FromUnicode(&r, 1);
3269 }
Tim Petersced69f82003-09-16 20:30:58 +00003270
Guido van Rossumd57fd912000-03-10 22:53:23 +00003271 v = _PyUnicode_New(size);
3272 if (v == NULL)
3273 goto onError;
3274 if (size == 0)
3275 return (PyObject *)v;
3276 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003277 e = s + size;
3278 while (s < e) {
3279 register unsigned char c = (unsigned char)*s;
3280 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003281 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003282 ++s;
3283 }
3284 else {
3285 startinpos = s-starts;
3286 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003287 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003288 if (unicode_decode_call_errorhandler(
3289 errors, &errorHandler,
3290 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003291 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003292 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003293 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003294 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003295 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003296 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003297 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003298 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003299 Py_XDECREF(errorHandler);
3300 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003301 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003302
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303 onError:
3304 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003305 Py_XDECREF(errorHandler);
3306 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003307 return NULL;
3308}
3309
Guido van Rossumd57fd912000-03-10 22:53:23 +00003310PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003311 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003312 const char *errors)
3313{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003314 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003315}
3316
3317PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3318{
3319 if (!PyUnicode_Check(unicode)) {
3320 PyErr_BadArgument();
3321 return NULL;
3322 }
3323 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3324 PyUnicode_GET_SIZE(unicode),
3325 NULL);
3326}
3327
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003328#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003329
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003330/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003331
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003332#if SIZEOF_INT < SIZEOF_SSIZE_T
3333#define NEED_RETRY
3334#endif
3335
3336/* XXX This code is limited to "true" double-byte encodings, as
3337 a) it assumes an incomplete character consists of a single byte, and
3338 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3339 encodings, see IsDBCSLeadByteEx documentation. */
3340
/* Report whether the byte at s[offset] should be treated as a DBCS lead
   byte.  Returns 0 when IsDBCSLeadByte() rejects the byte.  Otherwise
   returns 1 if CharPrev() cannot step back from it, if the character
   before it does not start with a lead byte, or if that character is
   exactly two bytes long; returns 0 in the remaining case. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *pos = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*pos))
        return 0;

    before = CharPrev(s, pos);
    if (before == pos)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return pos - before == 2;
}
3351
3352/*
3353 * Decode MBCS string into unicode object. If 'final' is set, converts
3354 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3355 */
3356static int decode_mbcs(PyUnicodeObject **v,
3357 const char *s, /* MBCS string */
3358 int size, /* sizeof MBCS string */
3359 int final)
3360{
3361 Py_UNICODE *p;
3362 Py_ssize_t n = 0;
3363 int usize = 0;
3364
3365 assert(size >= 0);
3366
3367 /* Skip trailing lead-byte unless 'final' is set */
3368 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3369 --size;
3370
3371 /* First get the size of the result */
3372 if (size > 0) {
3373 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3374 if (usize == 0) {
3375 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3376 return -1;
3377 }
3378 }
3379
3380 if (*v == NULL) {
3381 /* Create unicode object */
3382 *v = _PyUnicode_New(usize);
3383 if (*v == NULL)
3384 return -1;
3385 }
3386 else {
3387 /* Extend unicode object */
3388 n = PyUnicode_GET_SIZE(*v);
3389 if (_PyUnicode_Resize(v, n + usize) < 0)
3390 return -1;
3391 }
3392
3393 /* Do the conversion */
3394 if (size > 0) {
3395 p = PyUnicode_AS_UNICODE(*v) + n;
3396 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3397 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3398 return -1;
3399 }
3400 }
3401
3402 return size;
3403}
3404
3405PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3406 Py_ssize_t size,
3407 const char *errors,
3408 Py_ssize_t *consumed)
3409{
3410 PyUnicodeObject *v = NULL;
3411 int done;
3412
3413 if (consumed)
3414 *consumed = 0;
3415
3416#ifdef NEED_RETRY
3417 retry:
3418 if (size > INT_MAX)
3419 done = decode_mbcs(&v, s, INT_MAX, 0);
3420 else
3421#endif
3422 done = decode_mbcs(&v, s, (int)size, !consumed);
3423
3424 if (done < 0) {
3425 Py_XDECREF(v);
3426 return NULL;
3427 }
3428
3429 if (consumed)
3430 *consumed += done;
3431
3432#ifdef NEED_RETRY
3433 if (size > INT_MAX) {
3434 s += done;
3435 size -= done;
3436 goto retry;
3437 }
3438#endif
3439
3440 return (PyObject *)v;
3441}
3442
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003443PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003444 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003445 const char *errors)
3446{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003447 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3448}
3449
3450/*
3451 * Convert unicode into string object (MBCS).
3452 * Returns 0 if succeed, -1 otherwise.
3453 */
3454static int encode_mbcs(PyObject **repr,
3455 const Py_UNICODE *p, /* unicode */
3456 int size) /* size of unicode */
3457{
3458 int mbcssize = 0;
3459 Py_ssize_t n = 0;
3460
3461 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003462
3463 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003464 if (size > 0) {
3465 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3466 if (mbcssize == 0) {
3467 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3468 return -1;
3469 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003470 }
3471
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003472 if (*repr == NULL) {
3473 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003474 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003475 if (*repr == NULL)
3476 return -1;
3477 }
3478 else {
3479 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003480 n = PyBytes_Size(*repr);
3481 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003482 return -1;
3483 }
3484
3485 /* Do the conversion */
3486 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003487 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003488 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3489 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3490 return -1;
3491 }
3492 }
3493
3494 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003495}
3496
3497PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003498 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003499 const char *errors)
3500{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003501 PyObject *repr = NULL;
3502 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003503
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003504#ifdef NEED_RETRY
3505 retry:
3506 if (size > INT_MAX)
3507 ret = encode_mbcs(&repr, p, INT_MAX);
3508 else
3509#endif
3510 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003511
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003512 if (ret < 0) {
3513 Py_XDECREF(repr);
3514 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003515 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003516
3517#ifdef NEED_RETRY
3518 if (size > INT_MAX) {
3519 p += INT_MAX;
3520 size -= INT_MAX;
3521 goto retry;
3522 }
3523#endif
3524
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003525 return repr;
3526}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003527
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003528PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3529{
3530 if (!PyUnicode_Check(unicode)) {
3531 PyErr_BadArgument();
3532 return NULL;
3533 }
3534 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3535 PyUnicode_GET_SIZE(unicode),
3536 NULL);
3537}
3538
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003539#undef NEED_RETRY
3540
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003541#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003542
Guido van Rossumd57fd912000-03-10 22:53:23 +00003543/* --- Character Mapping Codec -------------------------------------------- */
3544
Guido van Rossumd57fd912000-03-10 22:53:23 +00003545PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003546 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003547 PyObject *mapping,
3548 const char *errors)
3549{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003550 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003551 Py_ssize_t startinpos;
3552 Py_ssize_t endinpos;
3553 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003554 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003555 PyUnicodeObject *v;
3556 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003557 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003558 PyObject *errorHandler = NULL;
3559 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003560 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003561 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003562
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563 /* Default to Latin-1 */
3564 if (mapping == NULL)
3565 return PyUnicode_DecodeLatin1(s, size, errors);
3566
3567 v = _PyUnicode_New(size);
3568 if (v == NULL)
3569 goto onError;
3570 if (size == 0)
3571 return (PyObject *)v;
3572 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003573 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003574 if (PyUnicode_CheckExact(mapping)) {
3575 mapstring = PyUnicode_AS_UNICODE(mapping);
3576 maplen = PyUnicode_GET_SIZE(mapping);
3577 while (s < e) {
3578 unsigned char ch = *s;
3579 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003580
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003581 if (ch < maplen)
3582 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003583
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003584 if (x == 0xfffe) {
3585 /* undefined mapping */
3586 outpos = p-PyUnicode_AS_UNICODE(v);
3587 startinpos = s-starts;
3588 endinpos = startinpos+1;
3589 if (unicode_decode_call_errorhandler(
3590 errors, &errorHandler,
3591 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003592 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003593 (PyObject **)&v, &outpos, &p)) {
3594 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003595 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003596 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003597 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003598 *p++ = x;
3599 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003600 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003601 }
3602 else {
3603 while (s < e) {
3604 unsigned char ch = *s;
3605 PyObject *w, *x;
3606
3607 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3608 w = PyInt_FromLong((long)ch);
3609 if (w == NULL)
3610 goto onError;
3611 x = PyObject_GetItem(mapping, w);
3612 Py_DECREF(w);
3613 if (x == NULL) {
3614 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3615 /* No mapping found means: mapping is undefined. */
3616 PyErr_Clear();
3617 x = Py_None;
3618 Py_INCREF(x);
3619 } else
3620 goto onError;
3621 }
3622
3623 /* Apply mapping */
3624 if (PyInt_Check(x)) {
3625 long value = PyInt_AS_LONG(x);
3626 if (value < 0 || value > 65535) {
3627 PyErr_SetString(PyExc_TypeError,
3628 "character mapping must be in range(65536)");
3629 Py_DECREF(x);
3630 goto onError;
3631 }
3632 *p++ = (Py_UNICODE)value;
3633 }
3634 else if (x == Py_None) {
3635 /* undefined mapping */
3636 outpos = p-PyUnicode_AS_UNICODE(v);
3637 startinpos = s-starts;
3638 endinpos = startinpos+1;
3639 if (unicode_decode_call_errorhandler(
3640 errors, &errorHandler,
3641 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003642 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003643 (PyObject **)&v, &outpos, &p)) {
3644 Py_DECREF(x);
3645 goto onError;
3646 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003647 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003648 continue;
3649 }
3650 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003651 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003652
3653 if (targetsize == 1)
3654 /* 1-1 mapping */
3655 *p++ = *PyUnicode_AS_UNICODE(x);
3656
3657 else if (targetsize > 1) {
3658 /* 1-n mapping */
3659 if (targetsize > extrachars) {
3660 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003661 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3662 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003663 (targetsize << 2);
3664 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003665 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003666 if (_PyUnicode_Resize(&v,
3667 PyUnicode_GET_SIZE(v) + needed) < 0) {
3668 Py_DECREF(x);
3669 goto onError;
3670 }
3671 p = PyUnicode_AS_UNICODE(v) + oldpos;
3672 }
3673 Py_UNICODE_COPY(p,
3674 PyUnicode_AS_UNICODE(x),
3675 targetsize);
3676 p += targetsize;
3677 extrachars -= targetsize;
3678 }
3679 /* 1-0 mapping: skip the character */
3680 }
3681 else {
3682 /* wrong return value */
3683 PyErr_SetString(PyExc_TypeError,
3684 "character mapping must return integer, None or unicode");
3685 Py_DECREF(x);
3686 goto onError;
3687 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003688 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003689 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003690 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003691 }
3692 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003693 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003694 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003695 Py_XDECREF(errorHandler);
3696 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003697 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003698
Guido van Rossumd57fd912000-03-10 22:53:23 +00003699 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003700 Py_XDECREF(errorHandler);
3701 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003702 Py_XDECREF(v);
3703 return NULL;
3704}
3705
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003706/* Charmap encoding: the lookup table */
3707
3708struct encoding_map{
3709 PyObject_HEAD
3710 unsigned char level1[32];
3711 int count2, count3;
3712 unsigned char level23[1];
3713};
3714
3715static PyObject*
3716encoding_map_size(PyObject *obj, PyObject* args)
3717{
3718 struct encoding_map *map = (struct encoding_map*)obj;
3719 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3720 128*map->count3);
3721}
3722
3723static PyMethodDef encoding_map_methods[] = {
3724 {"size", encoding_map_size, METH_NOARGS,
3725 PyDoc_STR("Return the size (in bytes) of this object") },
3726 { 0 }
3727};
3728
3729static void
3730encoding_map_dealloc(PyObject* o)
3731{
3732 PyObject_FREE(o);
3733}
3734
3735static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003736 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003737 "EncodingMap", /*tp_name*/
3738 sizeof(struct encoding_map), /*tp_basicsize*/
3739 0, /*tp_itemsize*/
3740 /* methods */
3741 encoding_map_dealloc, /*tp_dealloc*/
3742 0, /*tp_print*/
3743 0, /*tp_getattr*/
3744 0, /*tp_setattr*/
3745 0, /*tp_compare*/
3746 0, /*tp_repr*/
3747 0, /*tp_as_number*/
3748 0, /*tp_as_sequence*/
3749 0, /*tp_as_mapping*/
3750 0, /*tp_hash*/
3751 0, /*tp_call*/
3752 0, /*tp_str*/
3753 0, /*tp_getattro*/
3754 0, /*tp_setattro*/
3755 0, /*tp_as_buffer*/
3756 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3757 0, /*tp_doc*/
3758 0, /*tp_traverse*/
3759 0, /*tp_clear*/
3760 0, /*tp_richcompare*/
3761 0, /*tp_weaklistoffset*/
3762 0, /*tp_iter*/
3763 0, /*tp_iternext*/
3764 encoding_map_methods, /*tp_methods*/
3765 0, /*tp_members*/
3766 0, /*tp_getset*/
3767 0, /*tp_base*/
3768 0, /*tp_dict*/
3769 0, /*tp_descr_get*/
3770 0, /*tp_descr_set*/
3771 0, /*tp_dictoffset*/
3772 0, /*tp_init*/
3773 0, /*tp_alloc*/
3774 0, /*tp_new*/
3775 0, /*tp_free*/
3776 0, /*tp_is_gc*/
3777};
3778
3779PyObject*
3780PyUnicode_BuildEncodingMap(PyObject* string)
3781{
3782 Py_UNICODE *decode;
3783 PyObject *result;
3784 struct encoding_map *mresult;
3785 int i;
3786 int need_dict = 0;
3787 unsigned char level1[32];
3788 unsigned char level2[512];
3789 unsigned char *mlevel1, *mlevel2, *mlevel3;
3790 int count2 = 0, count3 = 0;
3791
3792 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3793 PyErr_BadArgument();
3794 return NULL;
3795 }
3796 decode = PyUnicode_AS_UNICODE(string);
3797 memset(level1, 0xFF, sizeof level1);
3798 memset(level2, 0xFF, sizeof level2);
3799
3800 /* If there isn't a one-to-one mapping of NULL to \0,
3801 or if there are non-BMP characters, we need to use
3802 a mapping dictionary. */
3803 if (decode[0] != 0)
3804 need_dict = 1;
3805 for (i = 1; i < 256; i++) {
3806 int l1, l2;
3807 if (decode[i] == 0
3808 #ifdef Py_UNICODE_WIDE
3809 || decode[i] > 0xFFFF
3810 #endif
3811 ) {
3812 need_dict = 1;
3813 break;
3814 }
3815 if (decode[i] == 0xFFFE)
3816 /* unmapped character */
3817 continue;
3818 l1 = decode[i] >> 11;
3819 l2 = decode[i] >> 7;
3820 if (level1[l1] == 0xFF)
3821 level1[l1] = count2++;
3822 if (level2[l2] == 0xFF)
3823 level2[l2] = count3++;
3824 }
3825
3826 if (count2 >= 0xFF || count3 >= 0xFF)
3827 need_dict = 1;
3828
3829 if (need_dict) {
3830 PyObject *result = PyDict_New();
3831 PyObject *key, *value;
3832 if (!result)
3833 return NULL;
3834 for (i = 0; i < 256; i++) {
3835 key = value = NULL;
3836 key = PyInt_FromLong(decode[i]);
3837 value = PyInt_FromLong(i);
3838 if (!key || !value)
3839 goto failed1;
3840 if (PyDict_SetItem(result, key, value) == -1)
3841 goto failed1;
3842 Py_DECREF(key);
3843 Py_DECREF(value);
3844 }
3845 return result;
3846 failed1:
3847 Py_XDECREF(key);
3848 Py_XDECREF(value);
3849 Py_DECREF(result);
3850 return NULL;
3851 }
3852
3853 /* Create a three-level trie */
3854 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3855 16*count2 + 128*count3 - 1);
3856 if (!result)
3857 return PyErr_NoMemory();
3858 PyObject_Init(result, &EncodingMapType);
3859 mresult = (struct encoding_map*)result;
3860 mresult->count2 = count2;
3861 mresult->count3 = count3;
3862 mlevel1 = mresult->level1;
3863 mlevel2 = mresult->level23;
3864 mlevel3 = mresult->level23 + 16*count2;
3865 memcpy(mlevel1, level1, 32);
3866 memset(mlevel2, 0xFF, 16*count2);
3867 memset(mlevel3, 0, 128*count3);
3868 count3 = 0;
3869 for (i = 1; i < 256; i++) {
3870 int o1, o2, o3, i2, i3;
3871 if (decode[i] == 0xFFFE)
3872 /* unmapped character */
3873 continue;
3874 o1 = decode[i]>>11;
3875 o2 = (decode[i]>>7) & 0xF;
3876 i2 = 16*mlevel1[o1] + o2;
3877 if (mlevel2[i2] == 0xFF)
3878 mlevel2[i2] = count3++;
3879 o3 = decode[i] & 0x7F;
3880 i3 = 128*mlevel2[i2] + o3;
3881 mlevel3[i3] = i;
3882 }
3883 return result;
3884}
3885
3886static int
3887encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3888{
3889 struct encoding_map *map = (struct encoding_map*)mapping;
3890 int l1 = c>>11;
3891 int l2 = (c>>7) & 0xF;
3892 int l3 = c & 0x7F;
3893 int i;
3894
3895#ifdef Py_UNICODE_WIDE
3896 if (c > 0xFFFF) {
3897 return -1;
3898 }
3899#endif
3900 if (c == 0)
3901 return 0;
3902 /* level 1*/
3903 i = map->level1[l1];
3904 if (i == 0xFF) {
3905 return -1;
3906 }
3907 /* level 2*/
3908 i = map->level23[16*i+l2];
3909 if (i == 0xFF) {
3910 return -1;
3911 }
3912 /* level 3 */
3913 i = map->level23[16*map->count2 + 128*i + l3];
3914 if (i == 0) {
3915 return -1;
3916 }
3917 return i;
3918}
3919
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003920/* Lookup the character ch in the mapping. If the character
3921 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003922 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003923static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003924{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003925 PyObject *w = PyInt_FromLong((long)c);
3926 PyObject *x;
3927
3928 if (w == NULL)
3929 return NULL;
3930 x = PyObject_GetItem(mapping, w);
3931 Py_DECREF(w);
3932 if (x == NULL) {
3933 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3934 /* No mapping found means: mapping is undefined. */
3935 PyErr_Clear();
3936 x = Py_None;
3937 Py_INCREF(x);
3938 return x;
3939 } else
3940 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003941 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003942 else if (x == Py_None)
3943 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003944 else if (PyInt_Check(x)) {
3945 long value = PyInt_AS_LONG(x);
3946 if (value < 0 || value > 255) {
3947 PyErr_SetString(PyExc_TypeError,
3948 "character mapping must be in range(256)");
3949 Py_DECREF(x);
3950 return NULL;
3951 }
3952 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003953 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003954 else if (PyString_Check(x))
3955 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003956 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003957 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00003958 PyErr_Format(PyExc_TypeError,
3959 "character mapping must return integer, None or str8, not %.400s",
3960 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003961 Py_DECREF(x);
3962 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003963 }
3964}
3965
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003966static int
Walter Dörwald827b0552007-05-12 13:23:53 +00003967charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003968{
Walter Dörwald827b0552007-05-12 13:23:53 +00003969 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003970 /* exponentially overallocate to minimize reallocations */
3971 if (requiredsize < 2*outsize)
3972 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00003973 if (PyBytes_Resize(outobj, requiredsize)) {
3974 Py_DECREF(outobj);
3975 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003976 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003977 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003978}
3979
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred while encoding */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003983/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00003984 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003985 space is available. Return a new reference to the object that
3986 was put in the output buffer, or Py_None, if the mapping was undefined
3987 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003988 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003989static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003990charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00003991 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003992{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003993 PyObject *rep;
3994 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00003995 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003996
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003997 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003998 int res = encoding_map_lookup(c, mapping);
3999 Py_ssize_t requiredsize = *outpos+1;
4000 if (res == -1)
4001 return enc_FAILED;
4002 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004003 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004004 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00004005 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004006 outstart[(*outpos)++] = (char)res;
4007 return enc_SUCCESS;
4008 }
4009
4010 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004011 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004012 return enc_EXCEPTION;
4013 else if (rep==Py_None) {
4014 Py_DECREF(rep);
4015 return enc_FAILED;
4016 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004017 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004018 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004019 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004020 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004021 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004022 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004023 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004024 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004025 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4026 }
4027 else {
4028 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004029 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4030 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004031 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004032 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004033 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004034 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004035 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004036 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004037 memcpy(outstart + *outpos, repchars, repsize);
4038 *outpos += repsize;
4039 }
4040 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004041 Py_DECREF(rep);
4042 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004043}
4044
4045/* handle an error in PyUnicode_EncodeCharmap
4046 Return 0 on success, -1 on error */
4047static
4048int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004049 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004050 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004051 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004052 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004053{
4054 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004055 Py_ssize_t repsize;
4056 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004057 Py_UNICODE *uni2;
4058 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004059 Py_ssize_t collstartpos = *inpos;
4060 Py_ssize_t collendpos = *inpos+1;
4061 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004062 char *encoding = "charmap";
4063 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004064 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004065
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004066 /* find all unencodable characters */
4067 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004068 PyObject *rep;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004069 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004070 int res = encoding_map_lookup(p[collendpos], mapping);
4071 if (res != -1)
4072 break;
4073 ++collendpos;
4074 continue;
4075 }
4076
4077 rep = charmapencode_lookup(p[collendpos], mapping);
4078 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004079 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004080 else if (rep!=Py_None) {
4081 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004082 break;
4083 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004084 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004085 ++collendpos;
4086 }
4087 /* cache callback name lookup
4088 * (if not done yet, i.e. it's the first error) */
4089 if (*known_errorHandler==-1) {
4090 if ((errors==NULL) || (!strcmp(errors, "strict")))
4091 *known_errorHandler = 1;
4092 else if (!strcmp(errors, "replace"))
4093 *known_errorHandler = 2;
4094 else if (!strcmp(errors, "ignore"))
4095 *known_errorHandler = 3;
4096 else if (!strcmp(errors, "xmlcharrefreplace"))
4097 *known_errorHandler = 4;
4098 else
4099 *known_errorHandler = 0;
4100 }
4101 switch (*known_errorHandler) {
4102 case 1: /* strict */
4103 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4104 return -1;
4105 case 2: /* replace */
4106 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4107 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004108 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004109 return -1;
4110 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004111 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004112 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4113 return -1;
4114 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004115 }
4116 /* fall through */
4117 case 3: /* ignore */
4118 *inpos = collendpos;
4119 break;
4120 case 4: /* xmlcharrefreplace */
4121 /* generate replacement (temporarily (mis)uses p) */
4122 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4123 char buffer[2+29+1+1];
4124 char *cp;
4125 sprintf(buffer, "&#%d;", (int)p[collpos]);
4126 for (cp = buffer; *cp; ++cp) {
4127 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004128 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004129 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004130 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004131 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4132 return -1;
4133 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004134 }
4135 }
4136 *inpos = collendpos;
4137 break;
4138 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004139 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004140 encoding, reason, p, size, exceptionObject,
4141 collstartpos, collendpos, &newpos);
4142 if (repunicode == NULL)
4143 return -1;
4144 /* generate replacement */
4145 repsize = PyUnicode_GET_SIZE(repunicode);
4146 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4147 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004148 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004149 return -1;
4150 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004151 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004152 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004153 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4154 return -1;
4155 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004156 }
4157 *inpos = newpos;
4158 Py_DECREF(repunicode);
4159 }
4160 return 0;
4161}
4162
Guido van Rossumd57fd912000-03-10 22:53:23 +00004163PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004164 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165 PyObject *mapping,
4166 const char *errors)
4167{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004168 /* output object */
4169 PyObject *res = NULL;
4170 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004171 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004172 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004173 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004174 PyObject *errorHandler = NULL;
4175 PyObject *exc = NULL;
4176 /* the following variable is used for caching string comparisons
4177 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4178 * 3=ignore, 4=xmlcharrefreplace */
4179 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004180
4181 /* Default to Latin-1 */
4182 if (mapping == NULL)
4183 return PyUnicode_EncodeLatin1(p, size, errors);
4184
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004185 /* allocate enough for a simple encoding without
4186 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004187 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004188 if (res == NULL)
4189 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004190 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004191 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004192
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004193 while (inpos<size) {
4194 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004195 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004196 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004197 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004198 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004199 if (charmap_encoding_error(p, size, &inpos, mapping,
4200 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004201 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004202 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004203 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004204 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004205 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004206 else
4207 /* done with this character => adjust input position */
4208 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004209 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004210
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004211 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004212 if (respos<PyBytes_GET_SIZE(res)) {
4213 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004214 goto onError;
4215 }
4216 Py_XDECREF(exc);
4217 Py_XDECREF(errorHandler);
4218 return res;
4219
4220 onError:
4221 Py_XDECREF(res);
4222 Py_XDECREF(exc);
4223 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004224 return NULL;
4225}
4226
4227PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4228 PyObject *mapping)
4229{
4230 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4231 PyErr_BadArgument();
4232 return NULL;
4233 }
4234 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4235 PyUnicode_GET_SIZE(unicode),
4236 mapping,
4237 NULL);
4238}
4239
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004240/* create or adjust a UnicodeTranslateError */
4241static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004242 const Py_UNICODE *unicode, Py_ssize_t size,
4243 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004244 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004245{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004246 if (*exceptionObject == NULL) {
4247 *exceptionObject = PyUnicodeTranslateError_Create(
4248 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004249 }
4250 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004251 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4252 goto onError;
4253 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4254 goto onError;
4255 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4256 goto onError;
4257 return;
4258 onError:
4259 Py_DECREF(*exceptionObject);
4260 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004261 }
4262}
4263
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004264/* raises a UnicodeTranslateError */
4265static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004266 const Py_UNICODE *unicode, Py_ssize_t size,
4267 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004268 const char *reason)
4269{
4270 make_translate_exception(exceptionObject,
4271 unicode, size, startpos, endpos, reason);
4272 if (*exceptionObject != NULL)
4273 PyCodec_StrictErrors(*exceptionObject);
4274}
4275
4276/* error handling callback helper:
4277 build arguments, call the callback and check the arguments,
4278 put the result into newpos and return the replacement string, which
4279 has to be freed by the caller */
4280static PyObject *unicode_translate_call_errorhandler(const char *errors,
4281 PyObject **errorHandler,
4282 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004283 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4284 Py_ssize_t startpos, Py_ssize_t endpos,
4285 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004286{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004287 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004288
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004289 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004290 PyObject *restuple;
4291 PyObject *resunicode;
4292
4293 if (*errorHandler == NULL) {
4294 *errorHandler = PyCodec_LookupError(errors);
4295 if (*errorHandler == NULL)
4296 return NULL;
4297 }
4298
4299 make_translate_exception(exceptionObject,
4300 unicode, size, startpos, endpos, reason);
4301 if (*exceptionObject == NULL)
4302 return NULL;
4303
4304 restuple = PyObject_CallFunctionObjArgs(
4305 *errorHandler, *exceptionObject, NULL);
4306 if (restuple == NULL)
4307 return NULL;
4308 if (!PyTuple_Check(restuple)) {
4309 PyErr_Format(PyExc_TypeError, &argparse[4]);
4310 Py_DECREF(restuple);
4311 return NULL;
4312 }
4313 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004314 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004315 Py_DECREF(restuple);
4316 return NULL;
4317 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004318 if (i_newpos<0)
4319 *newpos = size+i_newpos;
4320 else
4321 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004322 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004323 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004324 Py_DECREF(restuple);
4325 return NULL;
4326 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004327 Py_INCREF(resunicode);
4328 Py_DECREF(restuple);
4329 return resunicode;
4330}
4331
4332/* Lookup the character ch in the mapping and put the result in result,
4333 which must be decrefed by the caller.
4334 Return 0 on success, -1 on error */
4335static
4336int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4337{
4338 PyObject *w = PyInt_FromLong((long)c);
4339 PyObject *x;
4340
4341 if (w == NULL)
4342 return -1;
4343 x = PyObject_GetItem(mapping, w);
4344 Py_DECREF(w);
4345 if (x == NULL) {
4346 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4347 /* No mapping found means: use 1:1 mapping. */
4348 PyErr_Clear();
4349 *result = NULL;
4350 return 0;
4351 } else
4352 return -1;
4353 }
4354 else if (x == Py_None) {
4355 *result = x;
4356 return 0;
4357 }
4358 else if (PyInt_Check(x)) {
4359 long value = PyInt_AS_LONG(x);
4360 long max = PyUnicode_GetMax();
4361 if (value < 0 || value > max) {
4362 PyErr_Format(PyExc_TypeError,
4363 "character mapping must be in range(0x%lx)", max+1);
4364 Py_DECREF(x);
4365 return -1;
4366 }
4367 *result = x;
4368 return 0;
4369 }
4370 else if (PyUnicode_Check(x)) {
4371 *result = x;
4372 return 0;
4373 }
4374 else {
4375 /* wrong return value */
4376 PyErr_SetString(PyExc_TypeError,
4377 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004378 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004379 return -1;
4380 }
4381}
4382/* ensure that *outobj is at least requiredsize characters long,
4383if not reallocate and adjust various state variables.
4384Return 0 on success, -1 on error */
4385static
Walter Dörwald4894c302003-10-24 14:25:28 +00004386int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004387 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004388{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004389 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004390 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004391 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004392 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004393 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004394 if (requiredsize < 2 * oldsize)
4395 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004396 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004397 return -1;
4398 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004399 }
4400 return 0;
4401}
4402/* lookup the character, put the result in the output string and adjust
4403 various state variables. Return a new reference to the object that
4404 was put in the output buffer in *result, or Py_None, if the mapping was
4405 undefined (in which case no character was written).
4406 The called must decref result.
4407 Return 0 on success, -1 on error. */
4408static
Walter Dörwald4894c302003-10-24 14:25:28 +00004409int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004410 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004411 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004412{
Walter Dörwald4894c302003-10-24 14:25:28 +00004413 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004414 return -1;
4415 if (*res==NULL) {
4416 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004417 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004418 }
4419 else if (*res==Py_None)
4420 ;
4421 else if (PyInt_Check(*res)) {
4422 /* no overflow check, because we know that the space is enough */
4423 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4424 }
4425 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004426 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004427 if (repsize==1) {
4428 /* no overflow check, because we know that the space is enough */
4429 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4430 }
4431 else if (repsize!=0) {
4432 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004433 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004434 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004435 repsize - 1;
4436 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004437 return -1;
4438 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4439 *outp += repsize;
4440 }
4441 }
4442 else
4443 return -1;
4444 return 0;
4445}
4446
4447PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004448 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004449 PyObject *mapping,
4450 const char *errors)
4451{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004452 /* output object */
4453 PyObject *res = NULL;
4454 /* pointers to the beginning and end+1 of input */
4455 const Py_UNICODE *startp = p;
4456 const Py_UNICODE *endp = p + size;
4457 /* pointer into the output */
4458 Py_UNICODE *str;
4459 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004460 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004461 char *reason = "character maps to <undefined>";
4462 PyObject *errorHandler = NULL;
4463 PyObject *exc = NULL;
4464 /* the following variable is used for caching string comparisons
4465 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4466 * 3=ignore, 4=xmlcharrefreplace */
4467 int known_errorHandler = -1;
4468
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469 if (mapping == NULL) {
4470 PyErr_BadArgument();
4471 return NULL;
4472 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004473
4474 /* allocate enough for a simple 1:1 translation without
4475 replacements, if we need more, we'll resize */
4476 res = PyUnicode_FromUnicode(NULL, size);
4477 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004478 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004479 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004480 return res;
4481 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004482
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004483 while (p<endp) {
4484 /* try to encode it */
4485 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004486 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004487 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004488 goto onError;
4489 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004490 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004491 if (x!=Py_None) /* it worked => adjust input pointer */
4492 ++p;
4493 else { /* untranslatable character */
4494 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004495 Py_ssize_t repsize;
4496 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004497 Py_UNICODE *uni2;
4498 /* startpos for collecting untranslatable chars */
4499 const Py_UNICODE *collstart = p;
4500 const Py_UNICODE *collend = p+1;
4501 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004502
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004503 /* find all untranslatable characters */
4504 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004505 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004506 goto onError;
4507 Py_XDECREF(x);
4508 if (x!=Py_None)
4509 break;
4510 ++collend;
4511 }
4512 /* cache callback name lookup
4513 * (if not done yet, i.e. it's the first error) */
4514 if (known_errorHandler==-1) {
4515 if ((errors==NULL) || (!strcmp(errors, "strict")))
4516 known_errorHandler = 1;
4517 else if (!strcmp(errors, "replace"))
4518 known_errorHandler = 2;
4519 else if (!strcmp(errors, "ignore"))
4520 known_errorHandler = 3;
4521 else if (!strcmp(errors, "xmlcharrefreplace"))
4522 known_errorHandler = 4;
4523 else
4524 known_errorHandler = 0;
4525 }
4526 switch (known_errorHandler) {
4527 case 1: /* strict */
4528 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4529 goto onError;
4530 case 2: /* replace */
4531 /* No need to check for space, this is a 1:1 replacement */
4532 for (coll = collstart; coll<collend; ++coll)
4533 *str++ = '?';
4534 /* fall through */
4535 case 3: /* ignore */
4536 p = collend;
4537 break;
4538 case 4: /* xmlcharrefreplace */
4539 /* generate replacement (temporarily (mis)uses p) */
4540 for (p = collstart; p < collend; ++p) {
4541 char buffer[2+29+1+1];
4542 char *cp;
4543 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004544 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004545 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4546 goto onError;
4547 for (cp = buffer; *cp; ++cp)
4548 *str++ = *cp;
4549 }
4550 p = collend;
4551 break;
4552 default:
4553 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4554 reason, startp, size, &exc,
4555 collstart-startp, collend-startp, &newpos);
4556 if (repunicode == NULL)
4557 goto onError;
4558 /* generate replacement */
4559 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004560 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004561 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4562 Py_DECREF(repunicode);
4563 goto onError;
4564 }
4565 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4566 *str++ = *uni2;
4567 p = startp + newpos;
4568 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004569 }
4570 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004571 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004572 /* Resize if we allocated to much */
4573 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004574 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004575 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004576 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004577 }
4578 Py_XDECREF(exc);
4579 Py_XDECREF(errorHandler);
4580 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004581
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004582 onError:
4583 Py_XDECREF(res);
4584 Py_XDECREF(exc);
4585 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004586 return NULL;
4587}
4588
4589PyObject *PyUnicode_Translate(PyObject *str,
4590 PyObject *mapping,
4591 const char *errors)
4592{
4593 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004594
Guido van Rossumd57fd912000-03-10 22:53:23 +00004595 str = PyUnicode_FromObject(str);
4596 if (str == NULL)
4597 goto onError;
4598 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4599 PyUnicode_GET_SIZE(str),
4600 mapping,
4601 errors);
4602 Py_DECREF(str);
4603 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004604
Guido van Rossumd57fd912000-03-10 22:53:23 +00004605 onError:
4606 Py_XDECREF(str);
4607 return NULL;
4608}
Tim Petersced69f82003-09-16 20:30:58 +00004609
Guido van Rossum9e896b32000-04-05 20:11:21 +00004610/* --- Decimal Encoder ---------------------------------------------------- */
4611
4612int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004613 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004614 char *output,
4615 const char *errors)
4616{
4617 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004618 PyObject *errorHandler = NULL;
4619 PyObject *exc = NULL;
4620 const char *encoding = "decimal";
4621 const char *reason = "invalid decimal Unicode string";
4622 /* the following variable is used for caching string comparisons
4623 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4624 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004625
4626 if (output == NULL) {
4627 PyErr_BadArgument();
4628 return -1;
4629 }
4630
4631 p = s;
4632 end = s + length;
4633 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004634 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004635 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004636 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004637 Py_ssize_t repsize;
4638 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004639 Py_UNICODE *uni2;
4640 Py_UNICODE *collstart;
4641 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004642
Guido van Rossum9e896b32000-04-05 20:11:21 +00004643 if (Py_UNICODE_ISSPACE(ch)) {
4644 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004645 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004646 continue;
4647 }
4648 decimal = Py_UNICODE_TODECIMAL(ch);
4649 if (decimal >= 0) {
4650 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004651 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004652 continue;
4653 }
Guido van Rossumba477042000-04-06 18:18:10 +00004654 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004655 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004656 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004657 continue;
4658 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004659 /* All other characters are considered unencodable */
4660 collstart = p;
4661 collend = p+1;
4662 while (collend < end) {
4663 if ((0 < *collend && *collend < 256) ||
4664 !Py_UNICODE_ISSPACE(*collend) ||
4665 Py_UNICODE_TODECIMAL(*collend))
4666 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004667 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004668 /* cache callback name lookup
4669 * (if not done yet, i.e. it's the first error) */
4670 if (known_errorHandler==-1) {
4671 if ((errors==NULL) || (!strcmp(errors, "strict")))
4672 known_errorHandler = 1;
4673 else if (!strcmp(errors, "replace"))
4674 known_errorHandler = 2;
4675 else if (!strcmp(errors, "ignore"))
4676 known_errorHandler = 3;
4677 else if (!strcmp(errors, "xmlcharrefreplace"))
4678 known_errorHandler = 4;
4679 else
4680 known_errorHandler = 0;
4681 }
4682 switch (known_errorHandler) {
4683 case 1: /* strict */
4684 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4685 goto onError;
4686 case 2: /* replace */
4687 for (p = collstart; p < collend; ++p)
4688 *output++ = '?';
4689 /* fall through */
4690 case 3: /* ignore */
4691 p = collend;
4692 break;
4693 case 4: /* xmlcharrefreplace */
4694 /* generate replacement (temporarily (mis)uses p) */
4695 for (p = collstart; p < collend; ++p)
4696 output += sprintf(output, "&#%d;", (int)*p);
4697 p = collend;
4698 break;
4699 default:
4700 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4701 encoding, reason, s, length, &exc,
4702 collstart-s, collend-s, &newpos);
4703 if (repunicode == NULL)
4704 goto onError;
4705 /* generate replacement */
4706 repsize = PyUnicode_GET_SIZE(repunicode);
4707 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4708 Py_UNICODE ch = *uni2;
4709 if (Py_UNICODE_ISSPACE(ch))
4710 *output++ = ' ';
4711 else {
4712 decimal = Py_UNICODE_TODECIMAL(ch);
4713 if (decimal >= 0)
4714 *output++ = '0' + decimal;
4715 else if (0 < ch && ch < 256)
4716 *output++ = (char)ch;
4717 else {
4718 Py_DECREF(repunicode);
4719 raise_encode_exception(&exc, encoding,
4720 s, length, collstart-s, collend-s, reason);
4721 goto onError;
4722 }
4723 }
4724 }
4725 p = s + newpos;
4726 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004727 }
4728 }
4729 /* 0-terminate the output string */
4730 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004731 Py_XDECREF(exc);
4732 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004733 return 0;
4734
4735 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004736 Py_XDECREF(exc);
4737 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004738 return -1;
4739}
4740
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741/* --- Helpers ------------------------------------------------------------ */
4742
Thomas Wouters477c8d52006-05-27 19:21:47 +00004743#define STRINGLIB_CHAR Py_UNICODE
4744
4745#define STRINGLIB_LEN PyUnicode_GET_SIZE
4746#define STRINGLIB_NEW PyUnicode_FromUnicode
4747#define STRINGLIB_STR PyUnicode_AS_UNICODE
4748
4749Py_LOCAL_INLINE(int)
4750STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004751{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004752 if (str[0] != other[0])
4753 return 1;
4754 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755}
4756
Thomas Wouters477c8d52006-05-27 19:21:47 +00004757#define STRINGLIB_EMPTY unicode_empty
4758
4759#include "stringlib/fastsearch.h"
4760
4761#include "stringlib/count.h"
4762#include "stringlib/find.h"
4763#include "stringlib/partition.h"
4764
/* Helper macro that normalises the slice bounds held in the caller's
   local variables `start` and `end` against (obj)->length:
     - a negative `start` is offset by the length and then floored at 0;
     - `end` is capped at the length; a negative `end` is offset by the
       length and then floored at 0.
   Expands to several statements, so only use it where a statement
   sequence is valid (not as the unbraced body of an if/else). */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
4777
Martin v. Löwis18e16552006-02-15 17:27:45 +00004778Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004779 PyObject *substr,
4780 Py_ssize_t start,
4781 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004782{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004783 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004784 PyUnicodeObject* str_obj;
4785 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004786
Thomas Wouters477c8d52006-05-27 19:21:47 +00004787 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4788 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004789 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004790 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4791 if (!sub_obj) {
4792 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793 return -1;
4794 }
Tim Petersced69f82003-09-16 20:30:58 +00004795
Thomas Wouters477c8d52006-05-27 19:21:47 +00004796 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004797
Thomas Wouters477c8d52006-05-27 19:21:47 +00004798 result = stringlib_count(
4799 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4800 );
4801
4802 Py_DECREF(sub_obj);
4803 Py_DECREF(str_obj);
4804
Guido van Rossumd57fd912000-03-10 22:53:23 +00004805 return result;
4806}
4807
Martin v. Löwis18e16552006-02-15 17:27:45 +00004808Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004809 PyObject *sub,
4810 Py_ssize_t start,
4811 Py_ssize_t end,
4812 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004813{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004814 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004815
Guido van Rossumd57fd912000-03-10 22:53:23 +00004816 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004817 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004818 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004819 sub = PyUnicode_FromObject(sub);
4820 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004821 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004822 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823 }
Tim Petersced69f82003-09-16 20:30:58 +00004824
Thomas Wouters477c8d52006-05-27 19:21:47 +00004825 if (direction > 0)
4826 result = stringlib_find_slice(
4827 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4828 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4829 start, end
4830 );
4831 else
4832 result = stringlib_rfind_slice(
4833 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4834 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4835 start, end
4836 );
4837
Guido van Rossumd57fd912000-03-10 22:53:23 +00004838 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004839 Py_DECREF(sub);
4840
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841 return result;
4842}
4843
Tim Petersced69f82003-09-16 20:30:58 +00004844static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845int tailmatch(PyUnicodeObject *self,
4846 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004847 Py_ssize_t start,
4848 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849 int direction)
4850{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004851 if (substring->length == 0)
4852 return 1;
4853
Thomas Wouters477c8d52006-05-27 19:21:47 +00004854 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855
4856 end -= substring->length;
4857 if (end < start)
4858 return 0;
4859
4860 if (direction > 0) {
4861 if (Py_UNICODE_MATCH(self, end, substring))
4862 return 1;
4863 } else {
4864 if (Py_UNICODE_MATCH(self, start, substring))
4865 return 1;
4866 }
4867
4868 return 0;
4869}
4870
Martin v. Löwis18e16552006-02-15 17:27:45 +00004871Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004873 Py_ssize_t start,
4874 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004875 int direction)
4876{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004877 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004878
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879 str = PyUnicode_FromObject(str);
4880 if (str == NULL)
4881 return -1;
4882 substr = PyUnicode_FromObject(substr);
4883 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004884 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004885 return -1;
4886 }
Tim Petersced69f82003-09-16 20:30:58 +00004887
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888 result = tailmatch((PyUnicodeObject *)str,
4889 (PyUnicodeObject *)substr,
4890 start, end, direction);
4891 Py_DECREF(str);
4892 Py_DECREF(substr);
4893 return result;
4894}
4895
Guido van Rossumd57fd912000-03-10 22:53:23 +00004896/* Apply fixfct filter to the Unicode object self and return a
4897 reference to the modified object */
4898
Tim Petersced69f82003-09-16 20:30:58 +00004899static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004900PyObject *fixup(PyUnicodeObject *self,
4901 int (*fixfct)(PyUnicodeObject *s))
4902{
4903
4904 PyUnicodeObject *u;
4905
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004906 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004907 if (u == NULL)
4908 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004909
4910 Py_UNICODE_COPY(u->str, self->str, self->length);
4911
Tim Peters7a29bd52001-09-12 03:03:31 +00004912 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004913 /* fixfct should return TRUE if it modified the buffer. If
4914 FALSE, return a reference to the original buffer instead
4915 (to save space, not time) */
4916 Py_INCREF(self);
4917 Py_DECREF(u);
4918 return (PyObject*) self;
4919 }
4920 return (PyObject*) u;
4921}
4922
Tim Petersced69f82003-09-16 20:30:58 +00004923static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924int fixupper(PyUnicodeObject *self)
4925{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004926 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004927 Py_UNICODE *s = self->str;
4928 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004929
Guido van Rossumd57fd912000-03-10 22:53:23 +00004930 while (len-- > 0) {
4931 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004932
Guido van Rossumd57fd912000-03-10 22:53:23 +00004933 ch = Py_UNICODE_TOUPPER(*s);
4934 if (ch != *s) {
4935 status = 1;
4936 *s = ch;
4937 }
4938 s++;
4939 }
4940
4941 return status;
4942}
4943
Tim Petersced69f82003-09-16 20:30:58 +00004944static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004945int fixlower(PyUnicodeObject *self)
4946{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004947 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004948 Py_UNICODE *s = self->str;
4949 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004950
Guido van Rossumd57fd912000-03-10 22:53:23 +00004951 while (len-- > 0) {
4952 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004953
Guido van Rossumd57fd912000-03-10 22:53:23 +00004954 ch = Py_UNICODE_TOLOWER(*s);
4955 if (ch != *s) {
4956 status = 1;
4957 *s = ch;
4958 }
4959 s++;
4960 }
4961
4962 return status;
4963}
4964
Tim Petersced69f82003-09-16 20:30:58 +00004965static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004966int fixswapcase(PyUnicodeObject *self)
4967{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004968 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004969 Py_UNICODE *s = self->str;
4970 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004971
Guido van Rossumd57fd912000-03-10 22:53:23 +00004972 while (len-- > 0) {
4973 if (Py_UNICODE_ISUPPER(*s)) {
4974 *s = Py_UNICODE_TOLOWER(*s);
4975 status = 1;
4976 } else if (Py_UNICODE_ISLOWER(*s)) {
4977 *s = Py_UNICODE_TOUPPER(*s);
4978 status = 1;
4979 }
4980 s++;
4981 }
4982
4983 return status;
4984}
4985
Tim Petersced69f82003-09-16 20:30:58 +00004986static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004987int fixcapitalize(PyUnicodeObject *self)
4988{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004989 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004990 Py_UNICODE *s = self->str;
4991 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004992
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004993 if (len == 0)
4994 return 0;
4995 if (Py_UNICODE_ISLOWER(*s)) {
4996 *s = Py_UNICODE_TOUPPER(*s);
4997 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004998 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004999 s++;
5000 while (--len > 0) {
5001 if (Py_UNICODE_ISUPPER(*s)) {
5002 *s = Py_UNICODE_TOLOWER(*s);
5003 status = 1;
5004 }
5005 s++;
5006 }
5007 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005008}
5009
5010static
5011int fixtitle(PyUnicodeObject *self)
5012{
5013 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5014 register Py_UNICODE *e;
5015 int previous_is_cased;
5016
5017 /* Shortcut for single character strings */
5018 if (PyUnicode_GET_SIZE(self) == 1) {
5019 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5020 if (*p != ch) {
5021 *p = ch;
5022 return 1;
5023 }
5024 else
5025 return 0;
5026 }
Tim Petersced69f82003-09-16 20:30:58 +00005027
Guido van Rossumd57fd912000-03-10 22:53:23 +00005028 e = p + PyUnicode_GET_SIZE(self);
5029 previous_is_cased = 0;
5030 for (; p < e; p++) {
5031 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005032
Guido van Rossumd57fd912000-03-10 22:53:23 +00005033 if (previous_is_cased)
5034 *p = Py_UNICODE_TOLOWER(ch);
5035 else
5036 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005037
5038 if (Py_UNICODE_ISLOWER(ch) ||
5039 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005040 Py_UNICODE_ISTITLE(ch))
5041 previous_is_cased = 1;
5042 else
5043 previous_is_cased = 0;
5044 }
5045 return 1;
5046}
5047
Tim Peters8ce9f162004-08-27 01:49:32 +00005048PyObject *
5049PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050{
Tim Peters8ce9f162004-08-27 01:49:32 +00005051 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005052 const Py_UNICODE blank = ' ';
5053 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005054 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005055 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005056 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5057 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005058 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5059 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005060 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005061 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005062 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005063
Tim Peters05eba1f2004-08-27 21:32:02 +00005064 fseq = PySequence_Fast(seq, "");
5065 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005066 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005067 }
5068
Tim Peters91879ab2004-08-27 22:35:44 +00005069 /* Grrrr. A codec may be invoked to convert str objects to
5070 * Unicode, and so it's possible to call back into Python code
5071 * during PyUnicode_FromObject(), and so it's possible for a sick
5072 * codec to change the size of fseq (if seq is a list). Therefore
5073 * we have to keep refetching the size -- can't assume seqlen
5074 * is invariant.
5075 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005076 seqlen = PySequence_Fast_GET_SIZE(fseq);
5077 /* If empty sequence, return u"". */
5078 if (seqlen == 0) {
5079 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5080 goto Done;
5081 }
5082 /* If singleton sequence with an exact Unicode, return that. */
5083 if (seqlen == 1) {
5084 item = PySequence_Fast_GET_ITEM(fseq, 0);
5085 if (PyUnicode_CheckExact(item)) {
5086 Py_INCREF(item);
5087 res = (PyUnicodeObject *)item;
5088 goto Done;
5089 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005090 }
5091
Tim Peters05eba1f2004-08-27 21:32:02 +00005092 /* At least two items to join, or one that isn't exact Unicode. */
5093 if (seqlen > 1) {
5094 /* Set up sep and seplen -- they're needed. */
5095 if (separator == NULL) {
5096 sep = &blank;
5097 seplen = 1;
5098 }
5099 else {
5100 internal_separator = PyUnicode_FromObject(separator);
5101 if (internal_separator == NULL)
5102 goto onError;
5103 sep = PyUnicode_AS_UNICODE(internal_separator);
5104 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005105 /* In case PyUnicode_FromObject() mutated seq. */
5106 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005107 }
5108 }
5109
5110 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005111 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005112 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005113 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005114 res_p = PyUnicode_AS_UNICODE(res);
5115 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005116
Tim Peters05eba1f2004-08-27 21:32:02 +00005117 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005118 Py_ssize_t itemlen;
5119 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005120
5121 item = PySequence_Fast_GET_ITEM(fseq, i);
5122 /* Convert item to Unicode. */
5123 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5124 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005125 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005126 " %.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00005127 i, Py_Type(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005128 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005129 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005130 item = PyUnicode_FromObject(item);
5131 if (item == NULL)
5132 goto onError;
5133 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005134
Tim Peters91879ab2004-08-27 22:35:44 +00005135 /* In case PyUnicode_FromObject() mutated seq. */
5136 seqlen = PySequence_Fast_GET_SIZE(fseq);
5137
Tim Peters8ce9f162004-08-27 01:49:32 +00005138 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005139 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005140 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005141 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005142 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005143 if (i < seqlen - 1) {
5144 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005145 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005146 goto Overflow;
5147 }
5148 if (new_res_used > res_alloc) {
5149 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005150 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005151 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005152 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005153 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005154 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005155 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005156 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005157 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005158 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005159 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005160 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005161
5162 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005163 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005164 res_p += itemlen;
5165 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005166 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005167 res_p += seplen;
5168 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005169 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005170 res_used = new_res_used;
5171 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005172
Tim Peters05eba1f2004-08-27 21:32:02 +00005173 /* Shrink res to match the used area; this probably can't fail,
5174 * but it's cheap to check.
5175 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005176 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005177 goto onError;
5178
5179 Done:
5180 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005181 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182 return (PyObject *)res;
5183
Tim Peters8ce9f162004-08-27 01:49:32 +00005184 Overflow:
5185 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005186 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005187 Py_DECREF(item);
5188 /* fall through */
5189
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005191 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005192 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005193 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194 return NULL;
5195}
5196
Tim Petersced69f82003-09-16 20:30:58 +00005197static
5198PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005199 Py_ssize_t left,
5200 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005201 Py_UNICODE fill)
5202{
5203 PyUnicodeObject *u;
5204
5205 if (left < 0)
5206 left = 0;
5207 if (right < 0)
5208 right = 0;
5209
Tim Peters7a29bd52001-09-12 03:03:31 +00005210 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005211 Py_INCREF(self);
5212 return self;
5213 }
5214
5215 u = _PyUnicode_New(left + self->length + right);
5216 if (u) {
5217 if (left)
5218 Py_UNICODE_FILL(u->str, fill, left);
5219 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5220 if (right)
5221 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5222 }
5223
5224 return u;
5225}
5226
/* Append the slice data[left:right] to the list being built.

   Relies on names provided by the expanding function: a `PyObject *str`
   scratch variable, the target `list`, and an `onError` label that is
   jumped to when creating the slice or appending it fails.  Expands to
   several statements, so only use it where a statement sequence is
   valid. */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (str == NULL)                                                    \
        goto onError;                                                   \
    if (PyList_Append(list, str)) {                                     \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    Py_DECREF(str);
5237
5238static
5239PyObject *split_whitespace(PyUnicodeObject *self,
5240 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005241 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005243 register Py_ssize_t i;
5244 register Py_ssize_t j;
5245 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005246 PyObject *str;
5247
5248 for (i = j = 0; i < len; ) {
5249 /* find a token */
5250 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5251 i++;
5252 j = i;
5253 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5254 i++;
5255 if (j < i) {
5256 if (maxcount-- <= 0)
5257 break;
5258 SPLIT_APPEND(self->str, j, i);
5259 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5260 i++;
5261 j = i;
5262 }
5263 }
5264 if (j < len) {
5265 SPLIT_APPEND(self->str, j, len);
5266 }
5267 return list;
5268
5269 onError:
5270 Py_DECREF(list);
5271 return NULL;
5272}
5273
5274PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005275 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005276{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005277 register Py_ssize_t i;
5278 register Py_ssize_t j;
5279 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280 PyObject *list;
5281 PyObject *str;
5282 Py_UNICODE *data;
5283
5284 string = PyUnicode_FromObject(string);
5285 if (string == NULL)
5286 return NULL;
5287 data = PyUnicode_AS_UNICODE(string);
5288 len = PyUnicode_GET_SIZE(string);
5289
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290 list = PyList_New(0);
5291 if (!list)
5292 goto onError;
5293
5294 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005295 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005296
Guido van Rossumd57fd912000-03-10 22:53:23 +00005297 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005298 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005300
5301 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005302 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303 if (i < len) {
5304 if (data[i] == '\r' && i + 1 < len &&
5305 data[i+1] == '\n')
5306 i += 2;
5307 else
5308 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005309 if (keepends)
5310 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311 }
Guido van Rossum86662912000-04-11 15:38:46 +00005312 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313 j = i;
5314 }
5315 if (j < len) {
5316 SPLIT_APPEND(data, j, len);
5317 }
5318
5319 Py_DECREF(string);
5320 return list;
5321
5322 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005323 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324 Py_DECREF(string);
5325 return NULL;
5326}
5327
Tim Petersced69f82003-09-16 20:30:58 +00005328static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005329PyObject *split_char(PyUnicodeObject *self,
5330 PyObject *list,
5331 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005332 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005333{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005334 register Py_ssize_t i;
5335 register Py_ssize_t j;
5336 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337 PyObject *str;
5338
5339 for (i = j = 0; i < len; ) {
5340 if (self->str[i] == ch) {
5341 if (maxcount-- <= 0)
5342 break;
5343 SPLIT_APPEND(self->str, j, i);
5344 i = j = i + 1;
5345 } else
5346 i++;
5347 }
5348 if (j <= len) {
5349 SPLIT_APPEND(self->str, j, len);
5350 }
5351 return list;
5352
5353 onError:
5354 Py_DECREF(list);
5355 return NULL;
5356}
5357
Tim Petersced69f82003-09-16 20:30:58 +00005358static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359PyObject *split_substring(PyUnicodeObject *self,
5360 PyObject *list,
5361 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005362 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005364 register Py_ssize_t i;
5365 register Py_ssize_t j;
5366 Py_ssize_t len = self->length;
5367 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368 PyObject *str;
5369
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005370 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371 if (Py_UNICODE_MATCH(self, i, substring)) {
5372 if (maxcount-- <= 0)
5373 break;
5374 SPLIT_APPEND(self->str, j, i);
5375 i = j = i + sublen;
5376 } else
5377 i++;
5378 }
5379 if (j <= len) {
5380 SPLIT_APPEND(self->str, j, len);
5381 }
5382 return list;
5383
5384 onError:
5385 Py_DECREF(list);
5386 return NULL;
5387}
5388
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005389static
5390PyObject *rsplit_whitespace(PyUnicodeObject *self,
5391 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005392 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005393{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005394 register Py_ssize_t i;
5395 register Py_ssize_t j;
5396 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005397 PyObject *str;
5398
5399 for (i = j = len - 1; i >= 0; ) {
5400 /* find a token */
5401 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5402 i--;
5403 j = i;
5404 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5405 i--;
5406 if (j > i) {
5407 if (maxcount-- <= 0)
5408 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005409 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005410 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5411 i--;
5412 j = i;
5413 }
5414 }
5415 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005416 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005417 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005418 if (PyList_Reverse(list) < 0)
5419 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005420 return list;
5421
5422 onError:
5423 Py_DECREF(list);
5424 return NULL;
5425}
5426
5427static
5428PyObject *rsplit_char(PyUnicodeObject *self,
5429 PyObject *list,
5430 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005431 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005432{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005433 register Py_ssize_t i;
5434 register Py_ssize_t j;
5435 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005436 PyObject *str;
5437
5438 for (i = j = len - 1; i >= 0; ) {
5439 if (self->str[i] == ch) {
5440 if (maxcount-- <= 0)
5441 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005442 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005443 j = i = i - 1;
5444 } else
5445 i--;
5446 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005447 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005448 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005449 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005450 if (PyList_Reverse(list) < 0)
5451 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005452 return list;
5453
5454 onError:
5455 Py_DECREF(list);
5456 return NULL;
5457}
5458
5459static
5460PyObject *rsplit_substring(PyUnicodeObject *self,
5461 PyObject *list,
5462 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005463 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005464{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005465 register Py_ssize_t i;
5466 register Py_ssize_t j;
5467 Py_ssize_t len = self->length;
5468 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005469 PyObject *str;
5470
5471 for (i = len - sublen, j = len; i >= 0; ) {
5472 if (Py_UNICODE_MATCH(self, i, substring)) {
5473 if (maxcount-- <= 0)
5474 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005475 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005476 j = i;
5477 i -= sublen;
5478 } else
5479 i--;
5480 }
5481 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005482 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005483 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005484 if (PyList_Reverse(list) < 0)
5485 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005486 return list;
5487
5488 onError:
5489 Py_DECREF(list);
5490 return NULL;
5491}
5492
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493#undef SPLIT_APPEND
5494
5495static
5496PyObject *split(PyUnicodeObject *self,
5497 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005498 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499{
5500 PyObject *list;
5501
5502 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005503 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005504
5505 list = PyList_New(0);
5506 if (!list)
5507 return NULL;
5508
5509 if (substring == NULL)
5510 return split_whitespace(self,list,maxcount);
5511
5512 else if (substring->length == 1)
5513 return split_char(self,list,substring->str[0],maxcount);
5514
5515 else if (substring->length == 0) {
5516 Py_DECREF(list);
5517 PyErr_SetString(PyExc_ValueError, "empty separator");
5518 return NULL;
5519 }
5520 else
5521 return split_substring(self,list,substring,maxcount);
5522}
5523
Tim Petersced69f82003-09-16 20:30:58 +00005524static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005525PyObject *rsplit(PyUnicodeObject *self,
5526 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005527 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005528{
5529 PyObject *list;
5530
5531 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005532 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005533
5534 list = PyList_New(0);
5535 if (!list)
5536 return NULL;
5537
5538 if (substring == NULL)
5539 return rsplit_whitespace(self,list,maxcount);
5540
5541 else if (substring->length == 1)
5542 return rsplit_char(self,list,substring->str[0],maxcount);
5543
5544 else if (substring->length == 0) {
5545 Py_DECREF(list);
5546 PyErr_SetString(PyExc_ValueError, "empty separator");
5547 return NULL;
5548 }
5549 else
5550 return rsplit_substring(self,list,substring,maxcount);
5551}
5552
5553static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554PyObject *replace(PyUnicodeObject *self,
5555 PyUnicodeObject *str1,
5556 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005557 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558{
5559 PyUnicodeObject *u;
5560
5561 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005562 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563
Thomas Wouters477c8d52006-05-27 19:21:47 +00005564 if (str1->length == str2->length) {
5565 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005566 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005567 if (str1->length == 1) {
5568 /* replace characters */
5569 Py_UNICODE u1, u2;
5570 if (!findchar(self->str, self->length, str1->str[0]))
5571 goto nothing;
5572 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5573 if (!u)
5574 return NULL;
5575 Py_UNICODE_COPY(u->str, self->str, self->length);
5576 u1 = str1->str[0];
5577 u2 = str2->str[0];
5578 for (i = 0; i < u->length; i++)
5579 if (u->str[i] == u1) {
5580 if (--maxcount < 0)
5581 break;
5582 u->str[i] = u2;
5583 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005585 i = fastsearch(
5586 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005588 if (i < 0)
5589 goto nothing;
5590 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5591 if (!u)
5592 return NULL;
5593 Py_UNICODE_COPY(u->str, self->str, self->length);
5594 while (i <= self->length - str1->length)
5595 if (Py_UNICODE_MATCH(self, i, str1)) {
5596 if (--maxcount < 0)
5597 break;
5598 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5599 i += str1->length;
5600 } else
5601 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005602 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005604
5605 Py_ssize_t n, i, j, e;
5606 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607 Py_UNICODE *p;
5608
5609 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005610 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 if (n > maxcount)
5612 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005613 if (n == 0)
5614 goto nothing;
5615 /* new_size = self->length + n * (str2->length - str1->length)); */
5616 delta = (str2->length - str1->length);
5617 if (delta == 0) {
5618 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005620 product = n * (str2->length - str1->length);
5621 if ((product / (str2->length - str1->length)) != n) {
5622 PyErr_SetString(PyExc_OverflowError,
5623 "replace string is too long");
5624 return NULL;
5625 }
5626 new_size = self->length + product;
5627 if (new_size < 0) {
5628 PyErr_SetString(PyExc_OverflowError,
5629 "replace string is too long");
5630 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631 }
5632 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005633 u = _PyUnicode_New(new_size);
5634 if (!u)
5635 return NULL;
5636 i = 0;
5637 p = u->str;
5638 e = self->length - str1->length;
5639 if (str1->length > 0) {
5640 while (n-- > 0) {
5641 /* look for next match */
5642 j = i;
5643 while (j <= e) {
5644 if (Py_UNICODE_MATCH(self, j, str1))
5645 break;
5646 j++;
5647 }
5648 if (j > i) {
5649 if (j > e)
5650 break;
5651 /* copy unchanged part [i:j] */
5652 Py_UNICODE_COPY(p, self->str+i, j-i);
5653 p += j - i;
5654 }
5655 /* copy substitution string */
5656 if (str2->length > 0) {
5657 Py_UNICODE_COPY(p, str2->str, str2->length);
5658 p += str2->length;
5659 }
5660 i = j + str1->length;
5661 }
5662 if (i < self->length)
5663 /* copy tail [i:] */
5664 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5665 } else {
5666 /* interleave */
5667 while (n > 0) {
5668 Py_UNICODE_COPY(p, str2->str, str2->length);
5669 p += str2->length;
5670 if (--n <= 0)
5671 break;
5672 *p++ = self->str[i++];
5673 }
5674 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5675 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005678
5679nothing:
5680 /* nothing to replace; return original string (when possible) */
5681 if (PyUnicode_CheckExact(self)) {
5682 Py_INCREF(self);
5683 return (PyObject *) self;
5684 }
5685 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686}
5687
5688/* --- Unicode Object Methods --------------------------------------------- */
5689
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005690PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691"S.title() -> unicode\n\
5692\n\
5693Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005694characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695
5696static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005697unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 return fixup(self, fixtitle);
5700}
5701
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005702PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703"S.capitalize() -> unicode\n\
5704\n\
5705Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005706have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707
5708static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005709unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711 return fixup(self, fixcapitalize);
5712}
5713
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).

   Splits self on whitespace, replaces every word in the resulting list by
   its fixup(..., fixcapitalize) version and joins the words again with
   PyUnicode_Join(NULL, ...).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                       fixcapitalize);
        if (result == NULL)
            goto onError;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

onError:
    /* result is NULL here when a word could not be capitalized */
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
5751
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005752/* Argument converter. Coerces to a single unicode character */
5753
5754static int
5755convert_uc(PyObject *obj, void *addr)
5756{
5757 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5758 PyObject *uniobj;
5759 Py_UNICODE *unistr;
5760
5761 uniobj = PyUnicode_FromObject(obj);
5762 if (uniobj == NULL) {
5763 PyErr_SetString(PyExc_TypeError,
5764 "The fill character cannot be converted to Unicode");
5765 return 0;
5766 }
5767 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5768 PyErr_SetString(PyExc_TypeError,
5769 "The fill character must be exactly one character long");
5770 Py_DECREF(uniobj);
5771 return 0;
5772 }
5773 unistr = PyUnicode_AS_UNICODE(uniobj);
5774 *fillcharloc = unistr[0];
5775 Py_DECREF(uniobj);
5776 return 1;
5777}
5778
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005779PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005780"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005781\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005782Return S centered in a Unicode string of length width. Padding is\n\
5783done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005784
5785static PyObject *
5786unicode_center(PyUnicodeObject *self, PyObject *args)
5787{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005788 Py_ssize_t marg, left;
5789 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005790 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005791
Thomas Woutersde017742006-02-16 19:34:37 +00005792 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005793 return NULL;
5794
Tim Peters7a29bd52001-09-12 03:03:31 +00005795 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796 Py_INCREF(self);
5797 return (PyObject*) self;
5798 }
5799
5800 marg = width - self->length;
5801 left = marg / 2 + (marg & width & 1);
5802
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005803 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005804}
5805
Marc-André Lemburge5034372000-08-08 08:04:29 +00005806#if 0
5807
5808/* This code should go into some future Unicode collation support
5809 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005810 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005811
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005812/* speedy UTF-16 code point order comparison */
5813/* gleaned from: */
5814/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5815
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005816static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005817{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005818 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005819 0, 0, 0, 0, 0, 0, 0, 0,
5820 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005821 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005822};
5823
Guido van Rossumd57fd912000-03-10 22:53:23 +00005824static int
5825unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5826{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005827 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005828
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829 Py_UNICODE *s1 = str1->str;
5830 Py_UNICODE *s2 = str2->str;
5831
5832 len1 = str1->length;
5833 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005834
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005836 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005837
5838 c1 = *s1++;
5839 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005840
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005841 if (c1 > (1<<11) * 26)
5842 c1 += utf16Fixup[c1>>11];
5843 if (c2 > (1<<11) * 26)
5844 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005845 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005846
5847 if (c1 != c2)
5848 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005849
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005850 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851 }
5852
5853 return (len1 < len2) ? -1 : (len1 != len2);
5854}
5855
Marc-André Lemburge5034372000-08-08 08:04:29 +00005856#else
5857
5858static int
5859unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5860{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005861 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005862
5863 Py_UNICODE *s1 = str1->str;
5864 Py_UNICODE *s2 = str2->str;
5865
5866 len1 = str1->length;
5867 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005868
Marc-André Lemburge5034372000-08-08 08:04:29 +00005869 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005870 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005871
Fredrik Lundh45714e92001-06-26 16:39:36 +00005872 c1 = *s1++;
5873 c2 = *s2++;
5874
5875 if (c1 != c2)
5876 return (c1 < c2) ? -1 : 1;
5877
Marc-André Lemburge5034372000-08-08 08:04:29 +00005878 len1--; len2--;
5879 }
5880
5881 return (len1 < len2) ? -1 : (len1 != len2);
5882}
5883
5884#endif
5885
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886int PyUnicode_Compare(PyObject *left,
5887 PyObject *right)
5888{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005889 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5890 return unicode_compare((PyUnicodeObject *)left,
5891 (PyUnicodeObject *)right);
5892 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5893 (PyUnicode_Check(left) && PyString_Check(right))) {
5894 if (PyUnicode_Check(left))
5895 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5896 if (PyUnicode_Check(right))
5897 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5898 assert(PyString_Check(left));
5899 assert(PyString_Check(right));
5900 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005902 PyErr_Format(PyExc_TypeError,
5903 "Can't compare %.100s and %.100s",
5904 left->ob_type->tp_name,
5905 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 return -1;
5907}
5908
Martin v. Löwis5b222132007-06-10 09:51:05 +00005909int
5910PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
5911{
5912 int i;
5913 Py_UNICODE *id;
5914 assert(PyUnicode_Check(uni));
5915 id = PyUnicode_AS_UNICODE(uni);
5916 /* Compare Unicode string and source character set string */
5917 for (i = 0; id[i] && str[i]; i++)
5918 if (id[i] != str[i])
5919 return ((int)id[i] < (int)str[i]) ? -1 : 1;
5920 if (id[i])
5921 return 1; /* uni is longer */
5922 if (str[i])
5923 return -1; /* str is longer */
5924 return 0;
5925}
5926
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005927PyObject *PyUnicode_RichCompare(PyObject *left,
5928 PyObject *right,
5929 int op)
5930{
5931 int result;
5932
5933 result = PyUnicode_Compare(left, right);
5934 if (result == -1 && PyErr_Occurred())
5935 goto onError;
5936
5937 /* Convert the return value to a Boolean */
5938 switch (op) {
5939 case Py_EQ:
5940 result = (result == 0);
5941 break;
5942 case Py_NE:
5943 result = (result != 0);
5944 break;
5945 case Py_LE:
5946 result = (result <= 0);
5947 break;
5948 case Py_GE:
5949 result = (result >= 0);
5950 break;
5951 case Py_LT:
5952 result = (result == -1);
5953 break;
5954 case Py_GT:
5955 result = (result == 1);
5956 break;
5957 }
5958 return PyBool_FromLong(result);
5959
5960 onError:
5961
5962 /* Standard case
5963
5964 Type errors mean that PyUnicode_FromObject() could not convert
5965 one of the arguments (usually the right hand side) to Unicode,
5966 ie. we can't handle the comparison request. However, it is
5967 possible that the other object knows a comparison method, which
5968 is why we return Py_NotImplemented to give the other object a
5969 chance.
5970
5971 */
5972 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5973 PyErr_Clear();
5974 Py_INCREF(Py_NotImplemented);
5975 return Py_NotImplemented;
5976 }
5977 if (op != Py_EQ && op != Py_NE)
5978 return NULL;
5979
5980 /* Equality comparison.
5981
5982 This is a special case: we silence any PyExc_UnicodeDecodeError
5983 and instead turn it into a PyErr_UnicodeWarning.
5984
5985 */
5986 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5987 return NULL;
5988 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00005989 if (PyErr_WarnEx(PyExc_UnicodeWarning,
5990 (op == Py_EQ) ?
5991 "Unicode equal comparison "
5992 "failed to convert both arguments to Unicode - "
5993 "interpreting them as being unequal"
5994 :
5995 "Unicode unequal comparison "
5996 "failed to convert both arguments to Unicode - "
5997 "interpreting them as being unequal",
5998 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005999 return NULL;
6000 result = (op == Py_NE);
6001 return PyBool_FromLong(result);
6002}
6003
Guido van Rossum403d68b2000-03-13 15:55:09 +00006004int PyUnicode_Contains(PyObject *container,
6005 PyObject *element)
6006{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006007 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006008 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006009
6010 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006011 sub = PyUnicode_FromObject(element);
6012 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006013 PyErr_Format(PyExc_TypeError,
6014 "'in <string>' requires string as left operand, not %s",
6015 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006016 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006017 }
6018
Thomas Wouters477c8d52006-05-27 19:21:47 +00006019 str = PyUnicode_FromObject(container);
6020 if (!str) {
6021 Py_DECREF(sub);
6022 return -1;
6023 }
6024
6025 result = stringlib_contains_obj(str, sub);
6026
6027 Py_DECREF(str);
6028 Py_DECREF(sub);
6029
Guido van Rossum403d68b2000-03-13 15:55:09 +00006030 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006031}
6032
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033/* Concat to string or Unicode object giving a new Unicode object. */
6034
6035PyObject *PyUnicode_Concat(PyObject *left,
6036 PyObject *right)
6037{
6038 PyUnicodeObject *u = NULL, *v = NULL, *w;
6039
Guido van Rossum84d79dd2007-04-13 02:23:57 +00006040 if (PyBytes_Check(left) || PyBytes_Check(right))
6041 return PyBytes_Concat(left, right);
6042
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043 /* Coerce the two arguments */
6044 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6045 if (u == NULL)
6046 goto onError;
6047 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6048 if (v == NULL)
6049 goto onError;
6050
6051 /* Shortcuts */
6052 if (v == unicode_empty) {
6053 Py_DECREF(v);
6054 return (PyObject *)u;
6055 }
6056 if (u == unicode_empty) {
6057 Py_DECREF(u);
6058 return (PyObject *)v;
6059 }
6060
6061 /* Concat the two Unicode strings */
6062 w = _PyUnicode_New(u->length + v->length);
6063 if (w == NULL)
6064 goto onError;
6065 Py_UNICODE_COPY(w->str, u->str, u->length);
6066 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6067
6068 Py_DECREF(u);
6069 Py_DECREF(v);
6070 return (PyObject *)w;
6071
6072onError:
6073 Py_XDECREF(u);
6074 Py_XDECREF(v);
6075 return NULL;
6076}
6077
Walter Dörwald1ab83302007-05-18 17:15:44 +00006078void
6079PyUnicode_Append(PyObject **pleft, PyObject *right)
6080{
6081 PyObject *new;
6082 if (*pleft == NULL)
6083 return;
6084 if (right == NULL || !PyUnicode_Check(*pleft)) {
6085 Py_DECREF(*pleft);
6086 *pleft = NULL;
6087 return;
6088 }
6089 new = PyUnicode_Concat(*pleft, right);
6090 Py_DECREF(*pleft);
6091 *pleft = new;
6092}
6093
6094void
6095PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6096{
6097 PyUnicode_Append(pleft, right);
6098 Py_XDECREF(right);
6099}
6100
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006101PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102"S.count(sub[, start[, end]]) -> int\n\
6103\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006104Return the number of non-overlapping occurrences of substring sub in\n\
6105Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006106interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107
6108static PyObject *
6109unicode_count(PyUnicodeObject *self, PyObject *args)
6110{
6111 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006112 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006113 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 PyObject *result;
6115
Guido van Rossumb8872e62000-05-09 14:14:27 +00006116 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6117 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118 return NULL;
6119
6120 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006121 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122 if (substring == NULL)
6123 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006124
Thomas Wouters477c8d52006-05-27 19:21:47 +00006125 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126
Thomas Wouters477c8d52006-05-27 19:21:47 +00006127 result = PyInt_FromSsize_t(
6128 stringlib_count(self->str + start, end - start,
6129 substring->str, substring->length)
6130 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131
6132 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006133
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134 return result;
6135}
6136
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006137PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006138"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006140Encodes S using the codec registered for encoding. encoding defaults\n\
6141to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006142handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006143a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6144'xmlcharrefreplace' as well as any other name registered with\n\
6145codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146
6147static PyObject *
6148unicode_encode(PyUnicodeObject *self, PyObject *args)
6149{
6150 char *encoding = NULL;
6151 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006152 PyObject *v;
6153
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6155 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006156 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006157 if (v == NULL)
6158 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006159 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00006160 if (PyString_Check(v)) {
6161 /* Old codec, turn it into bytes */
6162 PyObject *b = PyBytes_FromObject(v);
6163 Py_DECREF(v);
6164 return b;
6165 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006166 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006167 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006168 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006169 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006170 Py_DECREF(v);
6171 return NULL;
6172 }
6173 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006174
6175 onError:
6176 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006177}
6178
6179PyDoc_STRVAR(decode__doc__,
6180"S.decode([encoding[,errors]]) -> string or unicode\n\
6181\n\
6182Decodes S using the codec registered for encoding. encoding defaults\n\
6183to the default encoding. errors may be given to set a different error\n\
6184handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6185a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6186as well as any other name registerd with codecs.register_error that is\n\
6187able to handle UnicodeDecodeErrors.");
6188
6189static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006190unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006191{
6192 char *encoding = NULL;
6193 char *errors = NULL;
6194 PyObject *v;
6195
6196 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6197 return NULL;
6198 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006199 if (v == NULL)
6200 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006201 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6202 PyErr_Format(PyExc_TypeError,
6203 "decoder did not return a string/unicode object "
6204 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006205 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006206 Py_DECREF(v);
6207 return NULL;
6208 }
6209 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006210
6211 onError:
6212 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213}
6214
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006215PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216"S.expandtabs([tabsize]) -> unicode\n\
6217\n\
6218Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006219If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220
6221static PyObject*
6222unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6223{
6224 Py_UNICODE *e;
6225 Py_UNICODE *p;
6226 Py_UNICODE *q;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006227 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228 PyUnicodeObject *u;
6229 int tabsize = 8;
6230
6231 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6232 return NULL;
6233
Thomas Wouters7e474022000-07-16 12:04:32 +00006234 /* First pass: determine size of output string */
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006235 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236 e = self->str + self->length;
6237 for (p = self->str; p < e; p++)
6238 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006239 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240 j += tabsize - (j % tabsize);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006241 if (old_j > j) {
6242 PyErr_SetString(PyExc_OverflowError,
6243 "new string is too long");
6244 return NULL;
6245 }
6246 old_j = j;
6247 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006248 }
6249 else {
6250 j++;
6251 if (*p == '\n' || *p == '\r') {
6252 i += j;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006253 old_j = j = 0;
6254 if (i < 0) {
6255 PyErr_SetString(PyExc_OverflowError,
6256 "new string is too long");
6257 return NULL;
6258 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006259 }
6260 }
6261
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006262 if ((i + j) < 0) {
6263 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6264 return NULL;
6265 }
6266
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267 /* Second pass: create output string and fill it */
6268 u = _PyUnicode_New(i + j);
6269 if (!u)
6270 return NULL;
6271
6272 j = 0;
6273 q = u->str;
6274
6275 for (p = self->str; p < e; p++)
6276 if (*p == '\t') {
6277 if (tabsize > 0) {
6278 i = tabsize - (j % tabsize);
6279 j += i;
6280 while (i--)
6281 *q++ = ' ';
6282 }
6283 }
6284 else {
6285 j++;
6286 *q++ = *p;
6287 if (*p == '\n' || *p == '\r')
6288 j = 0;
6289 }
6290
6291 return (PyObject*) u;
6292}
6293
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006294PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295"S.find(sub [,start [,end]]) -> int\n\
6296\n\
6297Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006298such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299arguments start and end are interpreted as in slice notation.\n\
6300\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006301Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302
6303static PyObject *
6304unicode_find(PyUnicodeObject *self, PyObject *args)
6305{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006306 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006307 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006308 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006309 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006310
Guido van Rossumb8872e62000-05-09 14:14:27 +00006311 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6312 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006314 substring = PyUnicode_FromObject(substring);
6315 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316 return NULL;
6317
Thomas Wouters477c8d52006-05-27 19:21:47 +00006318 result = stringlib_find_slice(
6319 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6320 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6321 start, end
6322 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323
6324 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006325
6326 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327}
6328
6329static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006330unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331{
6332 if (index < 0 || index >= self->length) {
6333 PyErr_SetString(PyExc_IndexError, "string index out of range");
6334 return NULL;
6335 }
6336
6337 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6338}
6339
6340static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006341unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006343 /* Since Unicode objects compare equal to their UTF-8 string
6344 counterparts, we hash the UTF-8 string. */
6345 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
6346 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347}
6348
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006349PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350"S.index(sub [,start [,end]]) -> int\n\
6351\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006352Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353
6354static PyObject *
6355unicode_index(PyUnicodeObject *self, PyObject *args)
6356{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006357 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006358 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006359 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006360 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006361
Guido van Rossumb8872e62000-05-09 14:14:27 +00006362 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6363 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006365 substring = PyUnicode_FromObject(substring);
6366 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006367 return NULL;
6368
Thomas Wouters477c8d52006-05-27 19:21:47 +00006369 result = stringlib_find_slice(
6370 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6371 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6372 start, end
6373 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006374
6375 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006376
Guido van Rossumd57fd912000-03-10 22:53:23 +00006377 if (result < 0) {
6378 PyErr_SetString(PyExc_ValueError, "substring not found");
6379 return NULL;
6380 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006381
Martin v. Löwis18e16552006-02-15 17:27:45 +00006382 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383}
6384
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006385PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006386"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006388Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006389at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390
6391static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006392unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393{
6394 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6395 register const Py_UNICODE *e;
6396 int cased;
6397
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398 /* Shortcut for single character strings */
6399 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006400 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006402 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006403 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006404 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006405
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406 e = p + PyUnicode_GET_SIZE(self);
6407 cased = 0;
6408 for (; p < e; p++) {
6409 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006410
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006412 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413 else if (!cased && Py_UNICODE_ISLOWER(ch))
6414 cased = 1;
6415 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006416 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417}
6418
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006419PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006420"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006422Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006423at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424
6425static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006426unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427{
6428 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6429 register const Py_UNICODE *e;
6430 int cased;
6431
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432 /* Shortcut for single character strings */
6433 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006434 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006436 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006437 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006438 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006439
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440 e = p + PyUnicode_GET_SIZE(self);
6441 cased = 0;
6442 for (; p < e; p++) {
6443 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006444
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006446 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447 else if (!cased && Py_UNICODE_ISUPPER(ch))
6448 cased = 1;
6449 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006450 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451}
6452
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006453PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006454"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006456Return True if S is a titlecased string and there is at least one\n\
6457character in S, i.e. upper- and titlecase characters may only\n\
6458follow uncased characters and lowercase characters only cased ones.\n\
6459Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460
6461static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006462unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463{
6464 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6465 register const Py_UNICODE *e;
6466 int cased, previous_is_cased;
6467
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468 /* Shortcut for single character strings */
6469 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006470 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6471 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006473 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006474 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006475 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006476
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477 e = p + PyUnicode_GET_SIZE(self);
6478 cased = 0;
6479 previous_is_cased = 0;
6480 for (; p < e; p++) {
6481 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006482
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6484 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006485 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486 previous_is_cased = 1;
6487 cased = 1;
6488 }
6489 else if (Py_UNICODE_ISLOWER(ch)) {
6490 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006491 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492 previous_is_cased = 1;
6493 cased = 1;
6494 }
6495 else
6496 previous_is_cased = 0;
6497 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006498 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499}
6500
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006501PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006502"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006504Return True if all characters in S are whitespace\n\
6505and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506
6507static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006508unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509{
6510 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6511 register const Py_UNICODE *e;
6512
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513 /* Shortcut for single character strings */
6514 if (PyUnicode_GET_SIZE(self) == 1 &&
6515 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006516 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006518 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006519 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006520 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006521
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522 e = p + PyUnicode_GET_SIZE(self);
6523 for (; p < e; p++) {
6524 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006525 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006527 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528}
6529
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006530PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006531"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006532\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006533Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006534and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006535
6536static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006537unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006538{
6539 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6540 register const Py_UNICODE *e;
6541
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006542 /* Shortcut for single character strings */
6543 if (PyUnicode_GET_SIZE(self) == 1 &&
6544 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006545 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006546
6547 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006548 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006549 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006550
6551 e = p + PyUnicode_GET_SIZE(self);
6552 for (; p < e; p++) {
6553 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006554 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006555 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006556 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006557}
6558
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006559PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006560"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006561\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006562Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006563and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006564
6565static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006566unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006567{
6568 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6569 register const Py_UNICODE *e;
6570
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006571 /* Shortcut for single character strings */
6572 if (PyUnicode_GET_SIZE(self) == 1 &&
6573 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006574 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006575
6576 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006577 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006578 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006579
6580 e = p + PyUnicode_GET_SIZE(self);
6581 for (; p < e; p++) {
6582 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006583 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006584 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006585 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006586}
6587
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006588PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006589"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006591Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006592False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593
6594static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006595unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596{
6597 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6598 register const Py_UNICODE *e;
6599
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600 /* Shortcut for single character strings */
6601 if (PyUnicode_GET_SIZE(self) == 1 &&
6602 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006603 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006605 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006606 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006607 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006608
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609 e = p + PyUnicode_GET_SIZE(self);
6610 for (; p < e; p++) {
6611 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006612 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006614 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615}
6616
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006617PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006618"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006620Return True if all characters in S are digits\n\
6621and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622
6623static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006624unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625{
6626 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6627 register const Py_UNICODE *e;
6628
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629 /* Shortcut for single character strings */
6630 if (PyUnicode_GET_SIZE(self) == 1 &&
6631 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006632 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006634 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006635 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006636 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006637
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638 e = p + PyUnicode_GET_SIZE(self);
6639 for (; p < e; p++) {
6640 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006641 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006643 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644}
6645
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006646PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006647"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006649Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006650False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651
6652static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006653unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654{
6655 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6656 register const Py_UNICODE *e;
6657
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658 /* Shortcut for single character strings */
6659 if (PyUnicode_GET_SIZE(self) == 1 &&
6660 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006661 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006663 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006664 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006665 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006666
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667 e = p + PyUnicode_GET_SIZE(self);
6668 for (; p < e; p++) {
6669 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006670 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006672 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673}
6674
Martin v. Löwis47383402007-08-15 07:32:56 +00006675int
6676PyUnicode_IsIdentifier(PyObject *self)
6677{
6678 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
6679 register const Py_UNICODE *e;
6680
6681 /* Special case for empty strings */
6682 if (PyUnicode_GET_SIZE(self) == 0)
6683 return 0;
6684
6685 /* PEP 3131 says that the first character must be in
6686 XID_Start and subsequent characters in XID_Continue,
6687 and for the ASCII range, the 2.x rules apply (i.e
6688 start with letters and underscore, continue with
6689 letters, digits, underscore). However, given the current
6690 definition of XID_Start and XID_Continue, it is sufficient
6691 to check just for these, except that _ must be allowed
6692 as starting an identifier. */
6693 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
6694 return 0;
6695
6696 e = p + PyUnicode_GET_SIZE(self);
6697 for (p++; p < e; p++) {
6698 if (!_PyUnicode_IsXidContinue(*p))
6699 return 0;
6700 }
6701 return 1;
6702}
6703
6704PyDoc_STRVAR(isidentifier__doc__,
6705"S.isidentifier() -> bool\n\
6706\n\
6707Return True if S is a valid identifier according\n\
6708to the language definition.");
6709
6710static PyObject*
6711unicode_isidentifier(PyObject *self)
6712{
6713 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
6714}
6715
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006716PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717"S.join(sequence) -> unicode\n\
6718\n\
6719Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006720sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721
6722static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006723unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006725 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726}
6727
Martin v. Löwis18e16552006-02-15 17:27:45 +00006728static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729unicode_length(PyUnicodeObject *self)
6730{
6731 return self->length;
6732}
6733
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006734PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006735"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736\n\
6737Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006738done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739
6740static PyObject *
6741unicode_ljust(PyUnicodeObject *self, PyObject *args)
6742{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006743 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006744 Py_UNICODE fillchar = ' ';
6745
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006746 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747 return NULL;
6748
Tim Peters7a29bd52001-09-12 03:03:31 +00006749 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750 Py_INCREF(self);
6751 return (PyObject*) self;
6752 }
6753
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006754 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755}
6756
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006757PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758"S.lower() -> unicode\n\
6759\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006760Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761
6762static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006763unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765 return fixup(self, fixlower);
6766}
6767
/* Strip modes understood by the strip helpers. */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* PyArg_ParseTuple format strings, indexed by the strip modes above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string minus its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i] + 3)
6776
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006777/* externally visible for str.strip(unicode) */
6778PyObject *
6779_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6780{
6781 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006782 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006783 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006784 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6785 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006786
Thomas Wouters477c8d52006-05-27 19:21:47 +00006787 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6788
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006789 i = 0;
6790 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006791 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6792 i++;
6793 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006794 }
6795
6796 j = len;
6797 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006798 do {
6799 j--;
6800 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6801 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006802 }
6803
6804 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006805 Py_INCREF(self);
6806 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006807 }
6808 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006809 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006810}
6811
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812
6813static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006814do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006816 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006817 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006818
6819 i = 0;
6820 if (striptype != RIGHTSTRIP) {
6821 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6822 i++;
6823 }
6824 }
6825
6826 j = len;
6827 if (striptype != LEFTSTRIP) {
6828 do {
6829 j--;
6830 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6831 j++;
6832 }
6833
6834 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6835 Py_INCREF(self);
6836 return (PyObject*)self;
6837 }
6838 else
6839 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840}
6841
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006842
6843static PyObject *
6844do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6845{
6846 PyObject *sep = NULL;
6847
6848 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6849 return NULL;
6850
6851 if (sep != NULL && sep != Py_None) {
6852 if (PyUnicode_Check(sep))
6853 return _PyUnicode_XStrip(self, striptype, sep);
6854 else if (PyString_Check(sep)) {
6855 PyObject *res;
6856 sep = PyUnicode_FromObject(sep);
6857 if (sep==NULL)
6858 return NULL;
6859 res = _PyUnicode_XStrip(self, striptype, sep);
6860 Py_DECREF(sep);
6861 return res;
6862 }
6863 else {
6864 PyErr_Format(PyExc_TypeError,
6865 "%s arg must be None, unicode or str",
6866 STRIPNAME(striptype));
6867 return NULL;
6868 }
6869 }
6870
6871 return do_strip(self, striptype);
6872}
6873
6874
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006875PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006876"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006877\n\
6878Return a copy of the string S with leading and trailing\n\
6879whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006880If chars is given and not None, remove characters in chars instead.\n\
6881If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006882
6883static PyObject *
6884unicode_strip(PyUnicodeObject *self, PyObject *args)
6885{
6886 if (PyTuple_GET_SIZE(args) == 0)
6887 return do_strip(self, BOTHSTRIP); /* Common case */
6888 else
6889 return do_argstrip(self, BOTHSTRIP, args);
6890}
6891
6892
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006893PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006894"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006895\n\
6896Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006897If chars is given and not None, remove characters in chars instead.\n\
6898If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006899
6900static PyObject *
6901unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6902{
6903 if (PyTuple_GET_SIZE(args) == 0)
6904 return do_strip(self, LEFTSTRIP); /* Common case */
6905 else
6906 return do_argstrip(self, LEFTSTRIP, args);
6907}
6908
6909
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006910PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006911"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006912\n\
6913Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006914If chars is given and not None, remove characters in chars instead.\n\
6915If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006916
6917static PyObject *
6918unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6919{
6920 if (PyTuple_GET_SIZE(args) == 0)
6921 return do_strip(self, RIGHTSTRIP); /* Common case */
6922 else
6923 return do_argstrip(self, RIGHTSTRIP, args);
6924}
6925
6926
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006928unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929{
6930 PyUnicodeObject *u;
6931 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006932 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006933 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934
6935 if (len < 0)
6936 len = 0;
6937
Tim Peters7a29bd52001-09-12 03:03:31 +00006938 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939 /* no repeat, return original string */
6940 Py_INCREF(str);
6941 return (PyObject*) str;
6942 }
Tim Peters8f422462000-09-09 06:13:41 +00006943
6944 /* ensure # of chars needed doesn't overflow int and # of bytes
6945 * needed doesn't overflow size_t
6946 */
6947 nchars = len * str->length;
6948 if (len && nchars / len != str->length) {
6949 PyErr_SetString(PyExc_OverflowError,
6950 "repeated string is too long");
6951 return NULL;
6952 }
6953 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6954 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6955 PyErr_SetString(PyExc_OverflowError,
6956 "repeated string is too long");
6957 return NULL;
6958 }
6959 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960 if (!u)
6961 return NULL;
6962
6963 p = u->str;
6964
Thomas Wouters477c8d52006-05-27 19:21:47 +00006965 if (str->length == 1 && len > 0) {
6966 Py_UNICODE_FILL(p, str->str[0], len);
6967 } else {
6968 Py_ssize_t done = 0; /* number of characters copied this far */
6969 if (done < nchars) {
6970 Py_UNICODE_COPY(p, str->str, str->length);
6971 done = str->length;
6972 }
6973 while (done < nchars) {
6974 int n = (done <= nchars-done) ? done : nchars-done;
6975 Py_UNICODE_COPY(p+done, p, n);
6976 done += n;
6977 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978 }
6979
6980 return (PyObject*) u;
6981}
6982
6983PyObject *PyUnicode_Replace(PyObject *obj,
6984 PyObject *subobj,
6985 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006986 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987{
6988 PyObject *self;
6989 PyObject *str1;
6990 PyObject *str2;
6991 PyObject *result;
6992
6993 self = PyUnicode_FromObject(obj);
6994 if (self == NULL)
6995 return NULL;
6996 str1 = PyUnicode_FromObject(subobj);
6997 if (str1 == NULL) {
6998 Py_DECREF(self);
6999 return NULL;
7000 }
7001 str2 = PyUnicode_FromObject(replobj);
7002 if (str2 == NULL) {
7003 Py_DECREF(self);
7004 Py_DECREF(str1);
7005 return NULL;
7006 }
Tim Petersced69f82003-09-16 20:30:58 +00007007 result = replace((PyUnicodeObject *)self,
7008 (PyUnicodeObject *)str1,
7009 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010 maxcount);
7011 Py_DECREF(self);
7012 Py_DECREF(str1);
7013 Py_DECREF(str2);
7014 return result;
7015}
7016
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007017PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018"S.replace (old, new[, maxsplit]) -> unicode\n\
7019\n\
7020Return a copy of S with all occurrences of substring\n\
7021old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007022given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023
7024static PyObject*
7025unicode_replace(PyUnicodeObject *self, PyObject *args)
7026{
7027 PyUnicodeObject *str1;
7028 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007029 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030 PyObject *result;
7031
Martin v. Löwis18e16552006-02-15 17:27:45 +00007032 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033 return NULL;
7034 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7035 if (str1 == NULL)
7036 return NULL;
7037 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007038 if (str2 == NULL) {
7039 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007041 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042
7043 result = replace(self, str1, str2, maxcount);
7044
7045 Py_DECREF(str1);
7046 Py_DECREF(str2);
7047 return result;
7048}
7049
7050static
7051PyObject *unicode_repr(PyObject *unicode)
7052{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007053 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007054 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007055 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7056 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7057
7058 /* XXX(nnorwitz): rather than over-allocating, it would be
7059 better to choose a different scheme. Perhaps scan the
7060 first N-chars of the string and allocate based on that size.
7061 */
7062 /* Initial allocation is based on the longest-possible unichr
7063 escape.
7064
7065 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7066 unichr, so in this case it's the longest unichr escape. In
7067 narrow (UTF-16) builds this is five chars per source unichr
7068 since there are two unichrs in the surrogate pair, so in narrow
7069 (UTF-16) builds it's not the longest unichr escape.
7070
7071 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7072 so in the narrow (UTF-16) build case it's the longest unichr
7073 escape.
7074 */
7075
Walter Dörwald1ab83302007-05-18 17:15:44 +00007076 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007077 2 /* quotes */
7078#ifdef Py_UNICODE_WIDE
7079 + 10*size
7080#else
7081 + 6*size
7082#endif
7083 + 1);
7084 if (repr == NULL)
7085 return NULL;
7086
Walter Dörwald1ab83302007-05-18 17:15:44 +00007087 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007088
7089 /* Add quote */
7090 *p++ = (findchar(s, size, '\'') &&
7091 !findchar(s, size, '"')) ? '"' : '\'';
7092 while (size-- > 0) {
7093 Py_UNICODE ch = *s++;
7094
7095 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007096 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007097 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007098 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007099 continue;
7100 }
7101
7102#ifdef Py_UNICODE_WIDE
7103 /* Map 21-bit characters to '\U00xxxxxx' */
7104 else if (ch >= 0x10000) {
7105 *p++ = '\\';
7106 *p++ = 'U';
7107 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7108 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7109 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7110 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7111 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7112 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7113 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7114 *p++ = hexdigits[ch & 0x0000000F];
7115 continue;
7116 }
7117#else
7118 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7119 else if (ch >= 0xD800 && ch < 0xDC00) {
7120 Py_UNICODE ch2;
7121 Py_UCS4 ucs;
7122
7123 ch2 = *s++;
7124 size--;
7125 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7126 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7127 *p++ = '\\';
7128 *p++ = 'U';
7129 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7130 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7131 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7132 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7133 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7134 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7135 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7136 *p++ = hexdigits[ucs & 0x0000000F];
7137 continue;
7138 }
7139 /* Fall through: isolated surrogates are copied as-is */
7140 s--;
7141 size++;
7142 }
7143#endif
7144
7145 /* Map 16-bit characters to '\uxxxx' */
7146 if (ch >= 256) {
7147 *p++ = '\\';
7148 *p++ = 'u';
7149 *p++ = hexdigits[(ch >> 12) & 0x000F];
7150 *p++ = hexdigits[(ch >> 8) & 0x000F];
7151 *p++ = hexdigits[(ch >> 4) & 0x000F];
7152 *p++ = hexdigits[ch & 0x000F];
7153 }
7154
7155 /* Map special whitespace to '\t', \n', '\r' */
7156 else if (ch == '\t') {
7157 *p++ = '\\';
7158 *p++ = 't';
7159 }
7160 else if (ch == '\n') {
7161 *p++ = '\\';
7162 *p++ = 'n';
7163 }
7164 else if (ch == '\r') {
7165 *p++ = '\\';
7166 *p++ = 'r';
7167 }
7168
7169 /* Map non-printable US ASCII to '\xhh' */
7170 else if (ch < ' ' || ch >= 0x7F) {
7171 *p++ = '\\';
7172 *p++ = 'x';
7173 *p++ = hexdigits[(ch >> 4) & 0x000F];
7174 *p++ = hexdigits[ch & 0x000F];
7175 }
7176
7177 /* Copy everything else as-is */
7178 else
7179 *p++ = (char) ch;
7180 }
7181 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007182 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007183
7184 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007185 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007186 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007187}
7188
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007189PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190"S.rfind(sub [,start [,end]]) -> int\n\
7191\n\
7192Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007193such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007194arguments start and end are interpreted as in slice notation.\n\
7195\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007196Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007197
7198static PyObject *
7199unicode_rfind(PyUnicodeObject *self, PyObject *args)
7200{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007201 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007202 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007203 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007204 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205
Guido van Rossumb8872e62000-05-09 14:14:27 +00007206 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7207 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007208 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007209 substring = PyUnicode_FromObject(substring);
7210 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211 return NULL;
7212
Thomas Wouters477c8d52006-05-27 19:21:47 +00007213 result = stringlib_rfind_slice(
7214 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7215 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7216 start, end
7217 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007218
7219 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007220
7221 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007222}
7223
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007224PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007225"S.rindex(sub [,start [,end]]) -> int\n\
7226\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007227Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007228
7229static PyObject *
7230unicode_rindex(PyUnicodeObject *self, PyObject *args)
7231{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007232 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007233 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007234 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007235 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007236
Guido van Rossumb8872e62000-05-09 14:14:27 +00007237 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7238 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007239 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007240 substring = PyUnicode_FromObject(substring);
7241 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242 return NULL;
7243
Thomas Wouters477c8d52006-05-27 19:21:47 +00007244 result = stringlib_rfind_slice(
7245 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7246 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7247 start, end
7248 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007249
7250 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007251
Guido van Rossumd57fd912000-03-10 22:53:23 +00007252 if (result < 0) {
7253 PyErr_SetString(PyExc_ValueError, "substring not found");
7254 return NULL;
7255 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007256 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007257}
7258
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007259PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007260"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261\n\
7262Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007263done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264
7265static PyObject *
7266unicode_rjust(PyUnicodeObject *self, PyObject *args)
7267{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007268 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007269 Py_UNICODE fillchar = ' ';
7270
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007271 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007272 return NULL;
7273
Tim Peters7a29bd52001-09-12 03:03:31 +00007274 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275 Py_INCREF(self);
7276 return (PyObject*) self;
7277 }
7278
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007279 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007280}
7281
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007283unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284{
7285 /* standard clamping */
7286 if (start < 0)
7287 start = 0;
7288 if (end < 0)
7289 end = 0;
7290 if (end > self->length)
7291 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007292 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007293 /* full slice, return original string */
7294 Py_INCREF(self);
7295 return (PyObject*) self;
7296 }
7297 if (start > end)
7298 start = end;
7299 /* copy slice */
7300 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7301 end - start);
7302}
7303
7304PyObject *PyUnicode_Split(PyObject *s,
7305 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007306 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007307{
7308 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007309
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310 s = PyUnicode_FromObject(s);
7311 if (s == NULL)
7312 return NULL;
7313 if (sep != NULL) {
7314 sep = PyUnicode_FromObject(sep);
7315 if (sep == NULL) {
7316 Py_DECREF(s);
7317 return NULL;
7318 }
7319 }
7320
7321 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7322
7323 Py_DECREF(s);
7324 Py_XDECREF(sep);
7325 return result;
7326}
7327
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007328PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007329"S.split([sep [,maxsplit]]) -> list of strings\n\
7330\n\
7331Return a list of the words in S, using sep as the\n\
7332delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007333splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007334any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007335
7336static PyObject*
7337unicode_split(PyUnicodeObject *self, PyObject *args)
7338{
7339 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007340 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007341
Martin v. Löwis18e16552006-02-15 17:27:45 +00007342 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343 return NULL;
7344
7345 if (substring == Py_None)
7346 return split(self, NULL, maxcount);
7347 else if (PyUnicode_Check(substring))
7348 return split(self, (PyUnicodeObject *)substring, maxcount);
7349 else
7350 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7351}
7352
Thomas Wouters477c8d52006-05-27 19:21:47 +00007353PyObject *
7354PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7355{
7356 PyObject* str_obj;
7357 PyObject* sep_obj;
7358 PyObject* out;
7359
7360 str_obj = PyUnicode_FromObject(str_in);
7361 if (!str_obj)
7362 return NULL;
7363 sep_obj = PyUnicode_FromObject(sep_in);
7364 if (!sep_obj) {
7365 Py_DECREF(str_obj);
7366 return NULL;
7367 }
7368
7369 out = stringlib_partition(
7370 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7371 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7372 );
7373
7374 Py_DECREF(sep_obj);
7375 Py_DECREF(str_obj);
7376
7377 return out;
7378}
7379
7380
7381PyObject *
7382PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7383{
7384 PyObject* str_obj;
7385 PyObject* sep_obj;
7386 PyObject* out;
7387
7388 str_obj = PyUnicode_FromObject(str_in);
7389 if (!str_obj)
7390 return NULL;
7391 sep_obj = PyUnicode_FromObject(sep_in);
7392 if (!sep_obj) {
7393 Py_DECREF(str_obj);
7394 return NULL;
7395 }
7396
7397 out = stringlib_rpartition(
7398 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7399 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7400 );
7401
7402 Py_DECREF(sep_obj);
7403 Py_DECREF(str_obj);
7404
7405 return out;
7406}
7407
7408PyDoc_STRVAR(partition__doc__,
7409"S.partition(sep) -> (head, sep, tail)\n\
7410\n\
7411Searches for the separator sep in S, and returns the part before it,\n\
7412the separator itself, and the part after it. If the separator is not\n\
7413found, returns S and two empty strings.");
7414
7415static PyObject*
7416unicode_partition(PyUnicodeObject *self, PyObject *separator)
7417{
7418 return PyUnicode_Partition((PyObject *)self, separator);
7419}
7420
7421PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007422"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007423\n\
7424Searches for the separator sep in S, starting at the end of S, and returns\n\
7425the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007426separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007427
7428static PyObject*
7429unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7430{
7431 return PyUnicode_RPartition((PyObject *)self, separator);
7432}
7433
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007434PyObject *PyUnicode_RSplit(PyObject *s,
7435 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007436 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007437{
7438 PyObject *result;
7439
7440 s = PyUnicode_FromObject(s);
7441 if (s == NULL)
7442 return NULL;
7443 if (sep != NULL) {
7444 sep = PyUnicode_FromObject(sep);
7445 if (sep == NULL) {
7446 Py_DECREF(s);
7447 return NULL;
7448 }
7449 }
7450
7451 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7452
7453 Py_DECREF(s);
7454 Py_XDECREF(sep);
7455 return result;
7456}
7457
7458PyDoc_STRVAR(rsplit__doc__,
7459"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7460\n\
7461Return a list of the words in S, using sep as the\n\
7462delimiter string, starting at the end of the string and\n\
7463working to the front. If maxsplit is given, at most maxsplit\n\
7464splits are done. If sep is not specified, any whitespace string\n\
7465is a separator.");
7466
7467static PyObject*
7468unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7469{
7470 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007471 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007472
Martin v. Löwis18e16552006-02-15 17:27:45 +00007473 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007474 return NULL;
7475
7476 if (substring == Py_None)
7477 return rsplit(self, NULL, maxcount);
7478 else if (PyUnicode_Check(substring))
7479 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7480 else
7481 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7482}
7483
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007484PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007485"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007486\n\
7487Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007488Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007489is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490
7491static PyObject*
7492unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7493{
Guido van Rossum86662912000-04-11 15:38:46 +00007494 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007495
Guido van Rossum86662912000-04-11 15:38:46 +00007496 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497 return NULL;
7498
Guido van Rossum86662912000-04-11 15:38:46 +00007499 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500}
7501
7502static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007503PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504{
Walter Dörwald346737f2007-05-31 10:44:43 +00007505 if (PyUnicode_CheckExact(self)) {
7506 Py_INCREF(self);
7507 return self;
7508 } else
7509 /* Subtype -- return genuine unicode string with the same value. */
7510 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7511 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007512}
7513
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007514PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515"S.swapcase() -> unicode\n\
7516\n\
7517Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007518and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007519
7520static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007521unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007522{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007523 return fixup(self, fixswapcase);
7524}
7525
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007526PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007527"S.translate(table) -> unicode\n\
7528\n\
7529Return a copy of the string S, where all characters have been mapped\n\
7530through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007531Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7532Unmapped characters are left untouched. Characters mapped to None\n\
7533are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007534
7535static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007536unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007537{
Tim Petersced69f82003-09-16 20:30:58 +00007538 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007539 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007540 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007541 "ignore");
7542}
7543
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007544PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007545"S.upper() -> unicode\n\
7546\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007547Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007548
7549static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007550unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552 return fixup(self, fixupper);
7553}
7554
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007555PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556"S.zfill(width) -> unicode\n\
7557\n\
7558Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007559of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007560
7561static PyObject *
7562unicode_zfill(PyUnicodeObject *self, PyObject *args)
7563{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007564 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565 PyUnicodeObject *u;
7566
Martin v. Löwis18e16552006-02-15 17:27:45 +00007567 Py_ssize_t width;
7568 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569 return NULL;
7570
7571 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007572 if (PyUnicode_CheckExact(self)) {
7573 Py_INCREF(self);
7574 return (PyObject*) self;
7575 }
7576 else
7577 return PyUnicode_FromUnicode(
7578 PyUnicode_AS_UNICODE(self),
7579 PyUnicode_GET_SIZE(self)
7580 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581 }
7582
7583 fill = width - self->length;
7584
7585 u = pad(self, fill, 0, '0');
7586
Walter Dörwald068325e2002-04-15 13:36:47 +00007587 if (u == NULL)
7588 return NULL;
7589
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590 if (u->str[fill] == '+' || u->str[fill] == '-') {
7591 /* move sign to beginning of string */
7592 u->str[0] = u->str[fill];
7593 u->str[fill] = '0';
7594 }
7595
7596 return (PyObject*) u;
7597}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007598
#if 0
/* Compiled out: returns the value of unicode_freelist_size as an int
   object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = unicode_freelist_size;

    return PyInt_FromLong(count);
}
#endif
7606
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007607PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007608"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007610Return True if S starts with the specified prefix, False otherwise.\n\
7611With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007612With optional end, stop comparing S at that position.\n\
7613prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614
7615static PyObject *
7616unicode_startswith(PyUnicodeObject *self,
7617 PyObject *args)
7618{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007619 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007620 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007621 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007622 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007623 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007624
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007625 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007626 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007627 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007628 if (PyTuple_Check(subobj)) {
7629 Py_ssize_t i;
7630 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7631 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7632 PyTuple_GET_ITEM(subobj, i));
7633 if (substring == NULL)
7634 return NULL;
7635 result = tailmatch(self, substring, start, end, -1);
7636 Py_DECREF(substring);
7637 if (result) {
7638 Py_RETURN_TRUE;
7639 }
7640 }
7641 /* nothing matched */
7642 Py_RETURN_FALSE;
7643 }
7644 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007646 return NULL;
7647 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007648 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007649 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007650}
7651
7652
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007653PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007654"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007656Return True if S ends with the specified suffix, False otherwise.\n\
7657With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007658With optional end, stop comparing S at that position.\n\
7659suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007660
7661static PyObject *
7662unicode_endswith(PyUnicodeObject *self,
7663 PyObject *args)
7664{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007665 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007666 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007667 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007668 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007669 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007670
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007671 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7672 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007673 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007674 if (PyTuple_Check(subobj)) {
7675 Py_ssize_t i;
7676 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7677 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7678 PyTuple_GET_ITEM(subobj, i));
7679 if (substring == NULL)
7680 return NULL;
7681 result = tailmatch(self, substring, start, end, +1);
7682 Py_DECREF(substring);
7683 if (result) {
7684 Py_RETURN_TRUE;
7685 }
7686 }
7687 Py_RETURN_FALSE;
7688 }
7689 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007691 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007692
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007693 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007694 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007695 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007696}
7697
7698
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007699
7700static PyObject *
7701unicode_getnewargs(PyUnicodeObject *v)
7702{
7703 return Py_BuildValue("(u#)", v->str, v->length);
7704}
7705
7706
Guido van Rossumd57fd912000-03-10 22:53:23 +00007707static PyMethodDef unicode_methods[] = {
7708
7709 /* Order is according to common usage: often used methods should
7710 appear first, since lookup is done sequentially. */
7711
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007712 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7713 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7714 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007715 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007716 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7717 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7718 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7719 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7720 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7721 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7722 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007723 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007724 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7725 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7726 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007727 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007728 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007729/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7730 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7731 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7732 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007733 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007734 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007735 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007736 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007737 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7738 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7739 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7740 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7741 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7742 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7743 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7744 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7745 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7746 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7747 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7748 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7749 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7750 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00007751 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007752 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007753#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007754 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755#endif
7756
7757#if 0
7758 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007759 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007760#endif
7761
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007762 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763 {NULL, NULL}
7764};
7765
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007766static PyObject *
7767unicode_mod(PyObject *v, PyObject *w)
7768{
7769 if (!PyUnicode_Check(v)) {
7770 Py_INCREF(Py_NotImplemented);
7771 return Py_NotImplemented;
7772 }
7773 return PyUnicode_Format(v, w);
7774}
7775
7776static PyNumberMethods unicode_as_number = {
7777 0, /*nb_add*/
7778 0, /*nb_subtract*/
7779 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007780 unicode_mod, /*nb_remainder*/
7781};
7782
Guido van Rossumd57fd912000-03-10 22:53:23 +00007783static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007784 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007785 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007786 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7787 (ssizeargfunc) unicode_getitem, /* sq_item */
7788 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789 0, /* sq_ass_item */
7790 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007791 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792};
7793
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007794static PyObject*
7795unicode_subscript(PyUnicodeObject* self, PyObject* item)
7796{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007797 if (PyIndex_Check(item)) {
7798 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007799 if (i == -1 && PyErr_Occurred())
7800 return NULL;
7801 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007802 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007803 return unicode_getitem(self, i);
7804 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007805 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007806 Py_UNICODE* source_buf;
7807 Py_UNICODE* result_buf;
7808 PyObject* result;
7809
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007810 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007811 &start, &stop, &step, &slicelength) < 0) {
7812 return NULL;
7813 }
7814
7815 if (slicelength <= 0) {
7816 return PyUnicode_FromUnicode(NULL, 0);
7817 } else {
7818 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007819 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7820 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007821
7822 if (result_buf == NULL)
7823 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007824
7825 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7826 result_buf[i] = source_buf[cur];
7827 }
Tim Petersced69f82003-09-16 20:30:58 +00007828
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007829 result = PyUnicode_FromUnicode(result_buf, slicelength);
7830 PyMem_FREE(result_buf);
7831 return result;
7832 }
7833 } else {
7834 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7835 return NULL;
7836 }
7837}
7838
7839static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007840 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007841 (binaryfunc)unicode_subscript, /* mp_subscript */
7842 (objobjargproc)0, /* mp_ass_subscript */
7843};
7844
Martin v. Löwis18e16552006-02-15 17:27:45 +00007845static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007846unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007847 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007848 const void **ptr)
7849{
7850 if (index != 0) {
7851 PyErr_SetString(PyExc_SystemError,
7852 "accessing non-existent unicode segment");
7853 return -1;
7854 }
7855 *ptr = (void *) self->str;
7856 return PyUnicode_GET_DATA_SIZE(self);
7857}
7858
Martin v. Löwis18e16552006-02-15 17:27:45 +00007859static Py_ssize_t
7860unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861 const void **ptr)
7862{
7863 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007864 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007865 return -1;
7866}
7867
7868static int
7869unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007870 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007871{
7872 if (lenp)
7873 *lenp = PyUnicode_GET_DATA_SIZE(self);
7874 return 1;
7875}
7876
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007877static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007878unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007879 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007880 const void **ptr)
7881{
7882 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007883
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884 if (index != 0) {
7885 PyErr_SetString(PyExc_SystemError,
7886 "accessing non-existent unicode segment");
7887 return -1;
7888 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007889 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007890 if (str == NULL)
7891 return -1;
7892 *ptr = (void *) PyString_AS_STRING(str);
7893 return PyString_GET_SIZE(str);
7894}
7895
7896/* Helpers for PyUnicode_Format() */
7897
7898static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007899getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007900{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007901 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007902 if (argidx < arglen) {
7903 (*p_argidx)++;
7904 if (arglen < 0)
7905 return args;
7906 else
7907 return PyTuple_GetItem(args, argidx);
7908 }
7909 PyErr_SetString(PyExc_TypeError,
7910 "not enough arguments for format string");
7911 return NULL;
7912}
7913
/* Bit flags collected while parsing a conversion specifier; each flag
   character of the format string sets one of them. */
#define F_LJUST (1 << 0)    /* '-' */
#define F_SIGN  (1 << 1)    /* '+' */
#define F_BLANK (1 << 2)    /* ' ' */
#define F_ALT   (1 << 3)    /* '#' */
#define F_ZERO  (1 << 4)    /* '0' */
7919
Martin v. Löwis18e16552006-02-15 17:27:45 +00007920static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007921strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007922{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007923 register Py_ssize_t i;
7924 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007925 for (i = len - 1; i >= 0; i--)
7926 buffer[i] = (Py_UNICODE) charbuffer[i];
7927
Guido van Rossumd57fd912000-03-10 22:53:23 +00007928 return len;
7929}
7930
Neal Norwitzfc76d632006-01-10 06:03:13 +00007931static int
7932doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7933{
Tim Peters15231542006-02-16 01:08:01 +00007934 Py_ssize_t result;
7935
Neal Norwitzfc76d632006-01-10 06:03:13 +00007936 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007937 result = strtounicode(buffer, (char *)buffer);
7938 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007939}
7940
7941static int
7942longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7943{
Tim Peters15231542006-02-16 01:08:01 +00007944 Py_ssize_t result;
7945
Neal Norwitzfc76d632006-01-10 06:03:13 +00007946 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007947 result = strtounicode(buffer, (char *)buffer);
7948 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007949}
7950
Guido van Rossum078151d2002-08-11 04:24:12 +00007951/* XXX To save some code duplication, formatfloat/long/int could have been
7952 shared with stringobject.c, converting from 8-bit to Unicode after the
7953 formatting is done. */
7954
Guido van Rossumd57fd912000-03-10 22:53:23 +00007955static int
7956formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007957 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007958 int flags,
7959 int prec,
7960 int type,
7961 PyObject *v)
7962{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007963 /* fmt = '%#.' + `prec` + `type`
7964 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007965 char fmt[20];
7966 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007967
Guido van Rossumd57fd912000-03-10 22:53:23 +00007968 x = PyFloat_AsDouble(v);
7969 if (x == -1.0 && PyErr_Occurred())
7970 return -1;
7971 if (prec < 0)
7972 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007973 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7974 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007975 /* Worst case length calc to ensure no buffer overrun:
7976
7977 'g' formats:
7978 fmt = %#.<prec>g
7979 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7980 for any double rep.)
7981 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7982
7983 'f' formats:
7984 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7985 len = 1 + 50 + 1 + prec = 52 + prec
7986
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007987 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007988 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007989
7990 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00007991 if (((type == 'g' || type == 'G') &&
7992 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007993 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007994 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007995 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007996 return -1;
7997 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007998 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7999 (flags&F_ALT) ? "#" : "",
8000 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008001 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002}
8003
Tim Peters38fd5b62000-09-21 05:43:11 +00008004static PyObject*
8005formatlong(PyObject *val, int flags, int prec, int type)
8006{
8007 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008008 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008009 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008010 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008011
8012 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8013 if (!str)
8014 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008015 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008016 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008017 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008018}
8019
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020static int
8021formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008022 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023 int flags,
8024 int prec,
8025 int type,
8026 PyObject *v)
8027{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008028 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008029 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8030 * + 1 + 1
8031 * = 24
8032 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008033 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008034 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035 long x;
8036
8037 x = PyInt_AsLong(v);
8038 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008039 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008040 if (x < 0 && type == 'u') {
8041 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008042 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008043 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8044 sign = "-";
8045 else
8046 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008048 prec = 1;
8049
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008050 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8051 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008052 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008053 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008054 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008055 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008056 return -1;
8057 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008058
8059 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008060 (type == 'x' || type == 'X' || type == 'o')) {
8061 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008062 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008063 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008064 * - when 0 is being converted, the C standard leaves off
8065 * the '0x' or '0X', which is inconsistent with other
8066 * %#x/%#X conversions and inconsistent with Python's
8067 * hex() function
8068 * - there are platforms that violate the standard and
8069 * convert 0 with the '0x' or '0X'
8070 * (Metrowerks, Compaq Tru64)
8071 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008072 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008073 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008074 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008075 * We can achieve the desired consistency by inserting our
8076 * own '0x' or '0X' prefix, and substituting %x/%X in place
8077 * of %#x/%#X.
8078 *
8079 * Note that this is the same approach as used in
8080 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008081 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008082 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8083 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008084 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008085 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008086 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8087 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008088 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008089 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008090 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008091 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008092 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008093 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008094}
8095
8096static int
8097formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008098 size_t buflen,
8099 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008100{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008101 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008102 if (PyUnicode_Check(v)) {
8103 if (PyUnicode_GET_SIZE(v) != 1)
8104 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008105 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008106 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008108 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008109 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008110 goto onError;
8111 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8112 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008113
8114 else {
8115 /* Integer input truncated to a character */
8116 long x;
8117 x = PyInt_AsLong(v);
8118 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008119 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008120#ifdef Py_UNICODE_WIDE
8121 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008122 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008123 "%c arg not in range(0x110000) "
8124 "(wide Python build)");
8125 return -1;
8126 }
8127#else
8128 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008129 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008130 "%c arg not in range(0x10000) "
8131 "(narrow Python build)");
8132 return -1;
8133 }
8134#endif
8135 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008136 }
8137 buf[1] = '\0';
8138 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008139
8140 onError:
8141 PyErr_SetString(PyExc_TypeError,
8142 "%c requires int or char");
8143 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008144}
8145
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesised so that the macro behaves as a
   single operand wherever it is used (e.g. "sizeof FORMATBUFLEN").
*/
#define FORMATBUFLEN ((size_t)120)
8155
Guido van Rossumd57fd912000-03-10 22:53:23 +00008156PyObject *PyUnicode_Format(PyObject *format,
8157 PyObject *args)
8158{
8159 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008160 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008161 int args_owned = 0;
8162 PyUnicodeObject *result = NULL;
8163 PyObject *dict = NULL;
8164 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008165
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166 if (format == NULL || args == NULL) {
8167 PyErr_BadInternalCall();
8168 return NULL;
8169 }
8170 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008171 if (uformat == NULL)
8172 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008173 fmt = PyUnicode_AS_UNICODE(uformat);
8174 fmtcnt = PyUnicode_GET_SIZE(uformat);
8175
8176 reslen = rescnt = fmtcnt + 100;
8177 result = _PyUnicode_New(reslen);
8178 if (result == NULL)
8179 goto onError;
8180 res = PyUnicode_AS_UNICODE(result);
8181
8182 if (PyTuple_Check(args)) {
8183 arglen = PyTuple_Size(args);
8184 argidx = 0;
8185 }
8186 else {
8187 arglen = -1;
8188 argidx = -2;
8189 }
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008190 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008191 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008192 dict = args;
8193
8194 while (--fmtcnt >= 0) {
8195 if (*fmt != '%') {
8196 if (--rescnt < 0) {
8197 rescnt = fmtcnt + 100;
8198 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008199 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008200 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008201 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8202 --rescnt;
8203 }
8204 *res++ = *fmt++;
8205 }
8206 else {
8207 /* Got a format specifier */
8208 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008209 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008210 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008211 Py_UNICODE c = '\0';
8212 Py_UNICODE fill;
8213 PyObject *v = NULL;
8214 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008215 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008217 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008218 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008219
8220 fmt++;
8221 if (*fmt == '(') {
8222 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008223 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224 PyObject *key;
8225 int pcount = 1;
8226
8227 if (dict == NULL) {
8228 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008229 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008230 goto onError;
8231 }
8232 ++fmt;
8233 --fmtcnt;
8234 keystart = fmt;
8235 /* Skip over balanced parentheses */
8236 while (pcount > 0 && --fmtcnt >= 0) {
8237 if (*fmt == ')')
8238 --pcount;
8239 else if (*fmt == '(')
8240 ++pcount;
8241 fmt++;
8242 }
8243 keylen = fmt - keystart - 1;
8244 if (fmtcnt < 0 || pcount > 0) {
8245 PyErr_SetString(PyExc_ValueError,
8246 "incomplete format key");
8247 goto onError;
8248 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008249#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008250 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008251 then looked up since Python uses strings to hold
8252 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008253 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008254 key = PyUnicode_EncodeUTF8(keystart,
8255 keylen,
8256 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008257#else
8258 key = PyUnicode_FromUnicode(keystart, keylen);
8259#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008260 if (key == NULL)
8261 goto onError;
8262 if (args_owned) {
8263 Py_DECREF(args);
8264 args_owned = 0;
8265 }
8266 args = PyObject_GetItem(dict, key);
8267 Py_DECREF(key);
8268 if (args == NULL) {
8269 goto onError;
8270 }
8271 args_owned = 1;
8272 arglen = -1;
8273 argidx = -2;
8274 }
8275 while (--fmtcnt >= 0) {
8276 switch (c = *fmt++) {
8277 case '-': flags |= F_LJUST; continue;
8278 case '+': flags |= F_SIGN; continue;
8279 case ' ': flags |= F_BLANK; continue;
8280 case '#': flags |= F_ALT; continue;
8281 case '0': flags |= F_ZERO; continue;
8282 }
8283 break;
8284 }
8285 if (c == '*') {
8286 v = getnextarg(args, arglen, &argidx);
8287 if (v == NULL)
8288 goto onError;
8289 if (!PyInt_Check(v)) {
8290 PyErr_SetString(PyExc_TypeError,
8291 "* wants int");
8292 goto onError;
8293 }
8294 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008295 if (width == -1 && PyErr_Occurred())
8296 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297 if (width < 0) {
8298 flags |= F_LJUST;
8299 width = -width;
8300 }
8301 if (--fmtcnt >= 0)
8302 c = *fmt++;
8303 }
8304 else if (c >= '0' && c <= '9') {
8305 width = c - '0';
8306 while (--fmtcnt >= 0) {
8307 c = *fmt++;
8308 if (c < '0' || c > '9')
8309 break;
8310 if ((width*10) / 10 != width) {
8311 PyErr_SetString(PyExc_ValueError,
8312 "width too big");
8313 goto onError;
8314 }
8315 width = width*10 + (c - '0');
8316 }
8317 }
8318 if (c == '.') {
8319 prec = 0;
8320 if (--fmtcnt >= 0)
8321 c = *fmt++;
8322 if (c == '*') {
8323 v = getnextarg(args, arglen, &argidx);
8324 if (v == NULL)
8325 goto onError;
8326 if (!PyInt_Check(v)) {
8327 PyErr_SetString(PyExc_TypeError,
8328 "* wants int");
8329 goto onError;
8330 }
8331 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008332 if (prec == -1 && PyErr_Occurred())
8333 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334 if (prec < 0)
8335 prec = 0;
8336 if (--fmtcnt >= 0)
8337 c = *fmt++;
8338 }
8339 else if (c >= '0' && c <= '9') {
8340 prec = c - '0';
8341 while (--fmtcnt >= 0) {
8342 c = Py_CHARMASK(*fmt++);
8343 if (c < '0' || c > '9')
8344 break;
8345 if ((prec*10) / 10 != prec) {
8346 PyErr_SetString(PyExc_ValueError,
8347 "prec too big");
8348 goto onError;
8349 }
8350 prec = prec*10 + (c - '0');
8351 }
8352 }
8353 } /* prec */
8354 if (fmtcnt >= 0) {
8355 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008356 if (--fmtcnt >= 0)
8357 c = *fmt++;
8358 }
8359 }
8360 if (fmtcnt < 0) {
8361 PyErr_SetString(PyExc_ValueError,
8362 "incomplete format");
8363 goto onError;
8364 }
8365 if (c != '%') {
8366 v = getnextarg(args, arglen, &argidx);
8367 if (v == NULL)
8368 goto onError;
8369 }
8370 sign = 0;
8371 fill = ' ';
8372 switch (c) {
8373
8374 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008375 pbuf = formatbuf;
8376 /* presume that buffer length is at least 1 */
8377 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008378 len = 1;
8379 break;
8380
8381 case 's':
8382 case 'r':
8383 if (PyUnicode_Check(v) && c == 's') {
8384 temp = v;
8385 Py_INCREF(temp);
8386 }
8387 else {
8388 PyObject *unicode;
8389 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008390 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008391 else
8392 temp = PyObject_Repr(v);
8393 if (temp == NULL)
8394 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008395 if (PyUnicode_Check(temp))
8396 /* nothing to do */;
8397 else if (PyString_Check(temp)) {
8398 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008399 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008400 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008401 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008402 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008403 Py_DECREF(temp);
8404 temp = unicode;
8405 if (temp == NULL)
8406 goto onError;
8407 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008408 else {
8409 Py_DECREF(temp);
8410 PyErr_SetString(PyExc_TypeError,
8411 "%s argument has non-string str()");
8412 goto onError;
8413 }
8414 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008415 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008416 len = PyUnicode_GET_SIZE(temp);
8417 if (prec >= 0 && len > prec)
8418 len = prec;
8419 break;
8420
8421 case 'i':
8422 case 'd':
8423 case 'u':
8424 case 'o':
8425 case 'x':
8426 case 'X':
8427 if (c == 'i')
8428 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008429 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008430 temp = formatlong(v, flags, prec, c);
8431 if (!temp)
8432 goto onError;
8433 pbuf = PyUnicode_AS_UNICODE(temp);
8434 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008435 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008436 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008437 else {
8438 pbuf = formatbuf;
8439 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8440 flags, prec, c, v);
8441 if (len < 0)
8442 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008443 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008444 }
8445 if (flags & F_ZERO)
8446 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008447 break;
8448
8449 case 'e':
8450 case 'E':
8451 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008452 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008453 case 'g':
8454 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008455 if (c == 'F')
8456 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008457 pbuf = formatbuf;
8458 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8459 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008460 if (len < 0)
8461 goto onError;
8462 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008463 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008464 fill = '0';
8465 break;
8466
8467 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008468 pbuf = formatbuf;
8469 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470 if (len < 0)
8471 goto onError;
8472 break;
8473
8474 default:
8475 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008476 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008477 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008478 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008479 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008480 (Py_ssize_t)(fmt - 1 -
8481 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008482 goto onError;
8483 }
8484 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008485 if (*pbuf == '-' || *pbuf == '+') {
8486 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008487 len--;
8488 }
8489 else if (flags & F_SIGN)
8490 sign = '+';
8491 else if (flags & F_BLANK)
8492 sign = ' ';
8493 else
8494 sign = 0;
8495 }
8496 if (width < len)
8497 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008498 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008499 reslen -= rescnt;
8500 rescnt = width + fmtcnt + 100;
8501 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008502 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008503 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008504 PyErr_NoMemory();
8505 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008506 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008507 if (_PyUnicode_Resize(&result, reslen) < 0) {
8508 Py_XDECREF(temp);
8509 goto onError;
8510 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008511 res = PyUnicode_AS_UNICODE(result)
8512 + reslen - rescnt;
8513 }
8514 if (sign) {
8515 if (fill != ' ')
8516 *res++ = sign;
8517 rescnt--;
8518 if (width > len)
8519 width--;
8520 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008521 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008522 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008523 assert(pbuf[1] == c);
8524 if (fill != ' ') {
8525 *res++ = *pbuf++;
8526 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008527 }
Tim Petersfff53252001-04-12 18:38:48 +00008528 rescnt -= 2;
8529 width -= 2;
8530 if (width < 0)
8531 width = 0;
8532 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008533 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008534 if (width > len && !(flags & F_LJUST)) {
8535 do {
8536 --rescnt;
8537 *res++ = fill;
8538 } while (--width > len);
8539 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008540 if (fill == ' ') {
8541 if (sign)
8542 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008543 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008544 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008545 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008546 *res++ = *pbuf++;
8547 *res++ = *pbuf++;
8548 }
8549 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008550 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008551 res += len;
8552 rescnt -= len;
8553 while (--width >= len) {
8554 --rescnt;
8555 *res++ = ' ';
8556 }
8557 if (dict && (argidx < arglen) && c != '%') {
8558 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008559 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008560 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008561 goto onError;
8562 }
8563 Py_XDECREF(temp);
8564 } /* '%' */
8565 } /* until end */
8566 if (argidx < arglen && !dict) {
8567 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008568 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008569 goto onError;
8570 }
8571
Thomas Woutersa96affe2006-03-12 00:29:36 +00008572 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8573 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574 if (args_owned) {
8575 Py_DECREF(args);
8576 }
8577 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578 return (PyObject *)result;
8579
8580 onError:
8581 Py_XDECREF(result);
8582 Py_DECREF(uformat);
8583 if (args_owned) {
8584 Py_DECREF(args);
8585 }
8586 return NULL;
8587}
8588
/* Buffer-procedure table for str objects.  It is installed in the
   tp_as_buffer slot of PyUnicode_Type; each entry is cast to the
   function-pointer type of the slot it fills. */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,
    (writebufferproc) unicode_buffer_getwritebuf,
    (segcountproc) unicode_buffer_getsegcount,
    (charbufferproc) unicode_buffer_getcharbuf,
};
8595
Jeremy Hylton938ace62002-07-17 16:30:39 +00008596static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008597unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8598
Tim Peters6d6c1a32001-08-02 04:15:00 +00008599static PyObject *
8600unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8601{
8602 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00008603 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008604 char *encoding = NULL;
8605 char *errors = NULL;
8606
Guido van Rossume023fe02001-08-30 03:12:59 +00008607 if (type != &PyUnicode_Type)
8608 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008609 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8610 kwlist, &x, &encoding, &errors))
8611 return NULL;
8612 if (x == NULL)
8613 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008614 if (encoding == NULL && errors == NULL)
8615 return PyObject_Unicode(x);
8616 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008617 return PyUnicode_FromEncodedObject(x, encoding, errors);
8618}
8619
Guido van Rossume023fe02001-08-30 03:12:59 +00008620static PyObject *
8621unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8622{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008623 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008624 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008625
8626 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8627 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8628 if (tmp == NULL)
8629 return NULL;
8630 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008631 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008632 if (pnew == NULL) {
8633 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008634 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008635 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008636 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8637 if (pnew->str == NULL) {
8638 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008639 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008640 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008641 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008642 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008643 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8644 pnew->length = n;
8645 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008646 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008647 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008648}
8649
/* Docstring of the str type; installed as tp_doc of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"str(string [, encoding[, errors]]) -> object\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008656
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008657static PyObject *unicode_iter(PyObject *seq);
8658
/* Type object for str.  Slots holding 0 are left unset in this
   initializer; PyType_Ready() is called on this object from
   _PyUnicode_Init(). */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                              /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    unicode_repr,                       /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
        Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    PyUnicode_RichCompare,              /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    unicode_iter,                       /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
8702
8703/* Initialize the Unicode implementation */
8704
Thomas Wouters78890102000-07-22 19:25:51 +00008705void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008706{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008707 int i;
8708
Thomas Wouters477c8d52006-05-27 19:21:47 +00008709 /* XXX - move this array to unicodectype.c ? */
8710 Py_UNICODE linebreak[] = {
8711 0x000A, /* LINE FEED */
8712 0x000D, /* CARRIAGE RETURN */
8713 0x001C, /* FILE SEPARATOR */
8714 0x001D, /* GROUP SEPARATOR */
8715 0x001E, /* RECORD SEPARATOR */
8716 0x0085, /* NEXT LINE */
8717 0x2028, /* LINE SEPARATOR */
8718 0x2029, /* PARAGRAPH SEPARATOR */
8719 };
8720
Fred Drakee4315f52000-05-09 19:53:39 +00008721 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008722 unicode_freelist = NULL;
8723 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008725 if (!unicode_empty)
8726 return;
8727
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008728 for (i = 0; i < 256; i++)
8729 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008730 if (PyType_Ready(&PyUnicode_Type) < 0)
8731 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008732
8733 /* initialize the linebreak bloom filter */
8734 bloom_linebreak = make_bloom_mask(
8735 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8736 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008737
8738 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008739}
8740
8741/* Finalize the Unicode implementation */
8742
8743void
Thomas Wouters78890102000-07-22 19:25:51 +00008744_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008745{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008746 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008747 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008748
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008749 Py_XDECREF(unicode_empty);
8750 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008751
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008752 for (i = 0; i < 256; i++) {
8753 if (unicode_latin1[i]) {
8754 Py_DECREF(unicode_latin1[i]);
8755 unicode_latin1[i] = NULL;
8756 }
8757 }
8758
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008759 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008760 PyUnicodeObject *v = u;
8761 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008762 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008763 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008764 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008765 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008766 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008767 unicode_freelist = NULL;
8768 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008769}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008770
Walter Dörwald16807132007-05-25 13:52:07 +00008771void
8772PyUnicode_InternInPlace(PyObject **p)
8773{
8774 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
8775 PyObject *t;
8776 if (s == NULL || !PyUnicode_Check(s))
8777 Py_FatalError(
8778 "PyUnicode_InternInPlace: unicode strings only please!");
8779 /* If it's a subclass, we don't really know what putting
8780 it in the interned dict might do. */
8781 if (!PyUnicode_CheckExact(s))
8782 return;
8783 if (PyUnicode_CHECK_INTERNED(s))
8784 return;
8785 if (interned == NULL) {
8786 interned = PyDict_New();
8787 if (interned == NULL) {
8788 PyErr_Clear(); /* Don't leave an exception */
8789 return;
8790 }
8791 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00008792 /* It might be that the GetItem call fails even
8793 though the key is present in the dictionary,
8794 namely when this happens during a stack overflow. */
8795 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00008796 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00008797 Py_END_ALLOW_RECURSION
8798
Walter Dörwald16807132007-05-25 13:52:07 +00008799 if (t) {
8800 Py_INCREF(t);
8801 Py_DECREF(*p);
8802 *p = t;
8803 return;
8804 }
8805
Martin v. Löwis5b222132007-06-10 09:51:05 +00008806 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00008807 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
8808 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00008809 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00008810 return;
8811 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00008812 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00008813 /* The two references in interned are not counted by refcnt.
8814 The deallocator will take care of this */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00008815 Py_Refcnt(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00008816 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
8817}
8818
8819void
8820PyUnicode_InternImmortal(PyObject **p)
8821{
8822 PyUnicode_InternInPlace(p);
8823 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
8824 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
8825 Py_INCREF(*p);
8826 }
8827}
8828
8829PyObject *
8830PyUnicode_InternFromString(const char *cp)
8831{
8832 PyObject *s = PyUnicode_FromString(cp);
8833 if (s == NULL)
8834 return NULL;
8835 PyUnicode_InternInPlace(&s);
8836 return s;
8837}
8838
8839void _Py_ReleaseInternedUnicodeStrings(void)
8840{
8841 PyObject *keys;
8842 PyUnicodeObject *s;
8843 Py_ssize_t i, n;
8844 Py_ssize_t immortal_size = 0, mortal_size = 0;
8845
8846 if (interned == NULL || !PyDict_Check(interned))
8847 return;
8848 keys = PyDict_Keys(interned);
8849 if (keys == NULL || !PyList_Check(keys)) {
8850 PyErr_Clear();
8851 return;
8852 }
8853
8854 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
8855 detector, interned unicode strings are not forcibly deallocated;
8856 rather, we give them their stolen references back, and then clear
8857 and DECREF the interned dict. */
8858
8859 n = PyList_GET_SIZE(keys);
8860 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
8861 n);
8862 for (i = 0; i < n; i++) {
8863 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
8864 switch (s->state) {
8865 case SSTATE_NOT_INTERNED:
8866 /* XXX Shouldn't happen */
8867 break;
8868 case SSTATE_INTERNED_IMMORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00008869 Py_Refcnt(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00008870 immortal_size += s->length;
8871 break;
8872 case SSTATE_INTERNED_MORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00008873 Py_Refcnt(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00008874 mortal_size += s->length;
8875 break;
8876 default:
8877 Py_FatalError("Inconsistent interned string state.");
8878 }
8879 s->state = SSTATE_NOT_INTERNED;
8880 }
8881 fprintf(stderr, "total size of all interned strings: "
8882 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
8883 "mortal/immortal\n", mortal_size, immortal_size);
8884 Py_DECREF(keys);
8885 PyDict_Clear(interned);
8886 Py_DECREF(interned);
8887 interned = NULL;
8888}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008889
8890
8891/********************* Unicode Iterator **************************/
8892
/* Iterator over the characters of a str object. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;        /* index of the next character to yield */
    PyUnicodeObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
8898
/* Deallocator for the str iterator: untrack it from the garbage
   collector, drop the reference to the iterated string (it_seq may
   already be NULL, hence the XDECREF) and free the iterator. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
8906
/* GC traversal for the str iterator: it_seq is the only object
   reference stored in the iterator, so it is the only one visited. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
8913
8914static PyObject *
8915unicodeiter_next(unicodeiterobject *it)
8916{
8917 PyUnicodeObject *seq;
8918 PyObject *item;
8919
8920 assert(it != NULL);
8921 seq = it->it_seq;
8922 if (seq == NULL)
8923 return NULL;
8924 assert(PyUnicode_Check(seq));
8925
8926 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008927 item = PyUnicode_FromUnicode(
8928 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008929 if (item != NULL)
8930 ++it->it_index;
8931 return item;
8932 }
8933
8934 Py_DECREF(seq);
8935 it->it_seq = NULL;
8936 return NULL;
8937}
8938
8939static PyObject *
8940unicodeiter_len(unicodeiterobject *it)
8941{
8942 Py_ssize_t len = 0;
8943 if (it->it_seq)
8944 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8945 return PyInt_FromSsize_t(len);
8946}
8947
/* Docstring for the iterator's __length_hint__ method. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator type; __length_hint__ is its only
   entry. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL, NULL} /* sentinel */
};
8955
/* Type object of the iterator returned by unicode_iter().  Instances
   are GC-tracked (Py_TPFLAGS_HAVE_GC) and are their own iterator
   (tp_iter is PyObject_SelfIter). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "unicodeiterator",                  /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* tp_members */
};
8988
8989static PyObject *
8990unicode_iter(PyObject *seq)
8991{
8992 unicodeiterobject *it;
8993
8994 if (!PyUnicode_Check(seq)) {
8995 PyErr_BadInternalCall();
8996 return NULL;
8997 }
8998 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8999 if (it == NULL)
9000 return NULL;
9001 it->it_index = 0;
9002 Py_INCREF(seq);
9003 it->it_seq = (PyUnicodeObject *)seq;
9004 _PyObject_GC_TRACK(it);
9005 return (PyObject *)it;
9006}
9007
Martin v. Löwis5b222132007-06-10 09:51:05 +00009008size_t
9009Py_UNICODE_strlen(const Py_UNICODE *u)
9010{
9011 int res = 0;
9012 while(*u++)
9013 res++;
9014 return res;
9015}
9016
9017Py_UNICODE*
9018Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9019{
9020 Py_UNICODE *u = s1;
9021 while ((*u++ = *s2++));
9022 return s1;
9023}
9024
9025Py_UNICODE*
9026Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9027{
9028 Py_UNICODE *u = s1;
9029 while ((*u++ = *s2++))
9030 if (n-- == 0)
9031 break;
9032 return s1;
9033}
9034
9035int
9036Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9037{
9038 while (*s1 && *s2 && *s1 == *s2)
9039 s1++, s2++;
9040 if (*s1 && *s2)
9041 return (*s1 < *s2) ? -1 : +1;
9042 if (*s1)
9043 return 1;
9044 if (*s2)
9045 return -1;
9046 return 0;
9047}
9048
9049Py_UNICODE*
9050Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9051{
9052 const Py_UNICODE *p;
9053 for (p = s; *p; p++)
9054 if (*p == c)
9055 return (Py_UNICODE*)p;
9056 return NULL;
9057}
9058
9059
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009060#ifdef __cplusplus
9061}
9062#endif
9063
9064
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009065/*
9066Local variables:
9067c-basic-offset: 4
9068indent-tabs-mode: nil
9069End:
9070*/