blob: e9f97df8d5d38d2439021bd6a3fe861fe6979ca1 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Walter Dörwald16807132007-05-25 13:52:07 +000095/* This dictionary holds all interned unicode strings. Note that references
96 to strings in this dictionary are *not* counted in the string's ob_refcnt.
97 When the interned string reaches a refcnt of 0 the string deallocation
98 function will delete the reference from this dictionary.
99
   Another way to look at this is to say that the actual reference
   count of a string is: s->ob_refcnt + (s->ob_sstate?2:0)
102*/
103static PyObject *interned;
104
Guido van Rossumd57fd912000-03-10 22:53:23 +0000105/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000106static PyUnicodeObject *unicode_freelist;
107static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000109/* The empty Unicode object is shared to improve performance. */
110static PyUnicodeObject *unicode_empty;
111
112/* Single character Unicode strings in the Latin-1 range are being
113 shared as well. */
114static PyUnicodeObject *unicode_latin1[256];
115
Fred Drakee4315f52000-05-09 19:53:39 +0000116/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000117 parameter; it is fixed to "utf-8". Always use the
118 PyUnicode_GetDefaultEncoding() API to access this global. */
119static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000120
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000121Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000122PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000123{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000124#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000125 return 0x10FFFF;
126#else
127 /* This is actually an illegal character, so it should
128 not be passed to unichr. */
129 return 0xFFFF;
130#endif
131}
132
Thomas Wouters477c8d52006-05-27 19:21:47 +0000133/* --- Bloom Filters ----------------------------------------------------- */
134
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant 5 bits of each Unicode character as the bit index. */
138
139/* the linebreak mask is set up by Unicode_Init below */
140
/* Integer type used to hold a 32-slot bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK() as a cheap pre-filter. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of ch is set in mask.
   The shift is carried out on an unsigned long so that bit index 31 is
   well defined (shifting a signed int 1 by 31 is not), and mask is
   parenthesized so that compound expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap bloom pre-check followed by the exact line break test. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
149
150Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
151{
152 /* calculate simple bloom-style bitmask for a given unicode string */
153
154 long mask;
155 Py_ssize_t i;
156
157 mask = 0;
158 for (i = 0; i < len; i++)
159 mask |= (1 << (ptr[i] & 0x1F));
160
161 return mask;
162}
163
164Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
165{
166 Py_ssize_t i;
167
168 for (i = 0; i < setlen; i++)
169 if (set[i] == chr)
170 return 1;
171
172 return 0;
173}
174
/* Non-zero if chr is a member of set: the bloom mask is consulted first
   and the exact (linear) search only runs when the mask says "maybe".
   The whole expansion is parenthesized so that the macro can be negated
   or combined with other operators without changing its meaning. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
177
Guido van Rossumd57fd912000-03-10 22:53:23 +0000178/* --- Unicode Object ----------------------------------------------------- */
179
180static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000182 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183{
184 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000185
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000186 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000187 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000188 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190 /* Resizing shared object (unicode_empty or single character
191 objects) in-place is not allowed. Use PyUnicode_Resize()
192 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000194 if (unicode == unicode_empty ||
195 (unicode->length == 1 &&
196 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 return -1;
201 }
202
Thomas Wouters477c8d52006-05-27 19:21:47 +0000203 /* We allocate one more byte to make sure the string is Ux0000 terminated.
204 The overallocation is also used by fastsearch, which assumes that it's
205 safe to look at str[length] (without making any assumptions about what
206 it contains). */
207
Guido van Rossumd57fd912000-03-10 22:53:23 +0000208 oldstr = unicode->str;
209 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
210 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000211 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212 PyErr_NoMemory();
213 return -1;
214 }
215 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000216 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000218 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000220 if (unicode->defenc) {
221 Py_DECREF(unicode->defenc);
222 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000223 }
224 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000225
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 return 0;
227}
228
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000232
233 XXX This allocator could further be enhanced by assuring that the
234 free list never reduces its size below 1.
235
236*/
237
238static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000239PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000240{
241 register PyUnicodeObject *unicode;
242
Thomas Wouters477c8d52006-05-27 19:21:47 +0000243 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244 if (length == 0 && unicode_empty != NULL) {
245 Py_INCREF(unicode_empty);
246 return unicode_empty;
247 }
248
249 /* Unicode freelist & memory allocation */
250 if (unicode_freelist) {
251 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000252 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000255 /* Keep-Alive optimization: we only upsize the buffer,
256 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000257 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000258 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000259 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000260 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000261 }
262 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000263 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000265 }
266 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 }
268 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000269 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000270 if (unicode == NULL)
271 return NULL;
272 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
273 }
274
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000275 if (!unicode->str) {
276 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000277 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000278 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000279 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000280 * the caller fails before initializing str -- unicode_resize()
281 * reads str[0], and the Keep-Alive optimization can keep memory
282 * allocated for str alive across a call to unicode_dealloc(unicode).
283 * We don't want unicode_resize to read uninitialized memory in
284 * that case.
285 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000286 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000288 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000290 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000291 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000293
294 onError:
295 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000296 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000297 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298}
299
300static
Guido van Rossum9475a232001-10-05 20:51:39 +0000301void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302{
Walter Dörwald16807132007-05-25 13:52:07 +0000303 switch (PyUnicode_CHECK_INTERNED(unicode)) {
304 case SSTATE_NOT_INTERNED:
305 break;
306
307 case SSTATE_INTERNED_MORTAL:
308 /* revive dead object temporarily for DelItem */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +0000309 Py_Refcnt(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000310 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
311 Py_FatalError(
312 "deletion of interned unicode string failed");
313 break;
314
315 case SSTATE_INTERNED_IMMORTAL:
316 Py_FatalError("Immortal interned unicode string died.");
317
318 default:
319 Py_FatalError("Inconsistent interned unicode string state.");
320 }
321
Guido van Rossum604ddf82001-12-06 20:03:56 +0000322 if (PyUnicode_CheckExact(unicode) &&
323 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000324 /* Keep-Alive optimization */
325 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000326 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000327 unicode->str = NULL;
328 unicode->length = 0;
329 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000330 if (unicode->defenc) {
331 Py_DECREF(unicode->defenc);
332 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000333 }
334 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000335 *(PyUnicodeObject **)unicode = unicode_freelist;
336 unicode_freelist = unicode;
337 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000338 }
339 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000340 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000341 Py_XDECREF(unicode->defenc);
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000342 Py_Type(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 }
344}
345
Martin v. Löwis18e16552006-02-15 17:27:45 +0000346int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000347{
348 register PyUnicodeObject *v;
349
350 /* Argument checks */
351 if (unicode == NULL) {
352 PyErr_BadInternalCall();
353 return -1;
354 }
355 v = (PyUnicodeObject *)*unicode;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000356 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000357 PyErr_BadInternalCall();
358 return -1;
359 }
360
361 /* Resizing unicode_empty and single character objects is not
362 possible since these are being shared. We simply return a fresh
363 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000364 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000365 (v == unicode_empty || v->length == 1)) {
366 PyUnicodeObject *w = _PyUnicode_New(length);
367 if (w == NULL)
368 return -1;
369 Py_UNICODE_COPY(w->str, v->str,
370 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000371 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000372 *unicode = (PyObject *)w;
373 return 0;
374 }
375
376 /* Note that we don't have to modify *unicode for unshared Unicode
377 objects, since we can modify them in-place. */
378 return unicode_resize(v, length);
379}
380
381/* Internal API for use in unicodeobject.c only ! */
382#define _PyUnicode_Resize(unicodevar, length) \
383 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
384
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000386 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387{
388 PyUnicodeObject *unicode;
389
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000390 /* If the Unicode data is known at construction time, we can apply
391 some optimizations which share commonly used objects. */
392 if (u != NULL) {
393
394 /* Optimization for empty strings */
395 if (size == 0 && unicode_empty != NULL) {
396 Py_INCREF(unicode_empty);
397 return (PyObject *)unicode_empty;
398 }
399
400 /* Single character Unicode objects in the Latin-1 range are
401 shared when using this constructor */
402 if (size == 1 && *u < 256) {
403 unicode = unicode_latin1[*u];
404 if (!unicode) {
405 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000406 if (!unicode)
407 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000408 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000409 unicode_latin1[*u] = unicode;
410 }
411 Py_INCREF(unicode);
412 return (PyObject *)unicode;
413 }
414 }
Tim Petersced69f82003-09-16 20:30:58 +0000415
Guido van Rossumd57fd912000-03-10 22:53:23 +0000416 unicode = _PyUnicode_New(size);
417 if (!unicode)
418 return NULL;
419
420 /* Copy the Unicode data into the new object */
421 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000422 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423
424 return (PyObject *)unicode;
425}
426
Walter Dörwaldd2034312007-05-18 16:29:38 +0000427PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000428{
429 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000430 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000431 some optimizations which share commonly used objects.
432 Also, this means the input must be UTF-8, so fall back to the
433 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000434 if (u != NULL) {
435
436 /* Optimization for empty strings */
437 if (size == 0 && unicode_empty != NULL) {
438 Py_INCREF(unicode_empty);
439 return (PyObject *)unicode_empty;
440 }
441
Martin v. Löwis9c121062007-08-05 20:26:11 +0000442 /* Single characters are shared when using this constructor.
443 Restrict to ASCII, since the input must be UTF-8. */
444 if (size == 1 && Py_CHARMASK(*u) < 128) {
Guido van Rossum00058aa2007-07-19 18:21:28 +0000445 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000446 if (!unicode) {
447 unicode = _PyUnicode_New(1);
448 if (!unicode)
449 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000450 unicode->str[0] = Py_CHARMASK(*u);
451 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000452 }
453 Py_INCREF(unicode);
454 return (PyObject *)unicode;
455 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000456
457 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000458 }
459
Walter Dörwald55507312007-05-18 13:12:10 +0000460 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000461 if (!unicode)
462 return NULL;
463
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000464 return (PyObject *)unicode;
465}
466
Walter Dörwaldd2034312007-05-18 16:29:38 +0000467PyObject *PyUnicode_FromString(const char *u)
468{
469 size_t size = strlen(u);
470 if (size > PY_SSIZE_T_MAX) {
471 PyErr_SetString(PyExc_OverflowError, "input too long");
472 return NULL;
473 }
474
475 return PyUnicode_FromStringAndSize(u, size);
476}
477
Guido van Rossumd57fd912000-03-10 22:53:23 +0000478#ifdef HAVE_WCHAR_H
479
480PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000481 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000482{
483 PyUnicodeObject *unicode;
484
485 if (w == NULL) {
486 PyErr_BadInternalCall();
487 return NULL;
488 }
489
490 unicode = _PyUnicode_New(size);
491 if (!unicode)
492 return NULL;
493
494 /* Copy the wchar_t data into the new object */
495#ifdef HAVE_USABLE_WCHAR_T
496 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000497#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498 {
499 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000500 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000501 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000502 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503 *u++ = *w++;
504 }
505#endif
506
507 return (PyObject *)unicode;
508}
509
Walter Dörwald346737f2007-05-31 10:44:43 +0000510static void
511makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
512{
513 *fmt++ = '%';
514 if (width) {
515 if (zeropad)
516 *fmt++ = '0';
517 fmt += sprintf(fmt, "%d", width);
518 }
519 if (precision)
520 fmt += sprintf(fmt, ".%d", precision);
521 if (longflag)
522 *fmt++ = 'l';
523 else if (size_tflag) {
524 char *f = PY_FORMAT_SIZE_T;
525 while (*f)
526 *fmt++ = *f++;
527 }
528 *fmt++ = c;
529 *fmt = '\0';
530}
531
/* Append the NUL-terminated char string `string` to the output, using
   the caller's local variables `copy` (a const char * scratch pointer)
   and `s` (the output cursor, advanced past the copied characters).
   Every byte is converted through unsigned char so that bytes >= 0x80
   are not sign-extended on platforms where plain char is signed. */
#define appendstring(string) {for (copy = (string);*copy;) *s++ = (unsigned char)*copy++;}
533
534PyObject *
535PyUnicode_FromFormatV(const char *format, va_list vargs)
536{
537 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000538 Py_ssize_t callcount = 0;
539 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000540 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000541 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000542 int width = 0;
543 int precision = 0;
544 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000545 const char* f;
546 Py_UNICODE *s;
547 PyObject *string;
548 /* used by sprintf */
549 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000550 /* use abuffer instead of buffer, if we need more space
551 * (which can happen if there's a format specifier with width). */
552 char *abuffer = NULL;
553 char *realbuffer;
554 Py_ssize_t abuffersize = 0;
555 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000556 const char *copy;
557
558#ifdef VA_LIST_IS_ARRAY
559 Py_MEMCPY(count, vargs, sizeof(va_list));
560#else
561#ifdef __va_copy
562 __va_copy(count, vargs);
563#else
564 count = vargs;
565#endif
566#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000567 /* step 1: count the number of %S/%R format specifications
568 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
569 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000570 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000571 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000572 ++callcount;
573 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000574 /* step 2: allocate memory for the results of
575 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000576 if (callcount) {
577 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
578 if (!callresults) {
579 PyErr_NoMemory();
580 return NULL;
581 }
582 callresult = callresults;
583 }
584 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000585 for (f = format; *f; f++) {
586 if (*f == '%') {
587 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000588 width = 0;
589 while (isdigit(Py_CHARMASK(*f)))
590 width = (width*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000591 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
592 ;
593
594 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
595 * they don't affect the amount of space we reserve.
596 */
597 if ((*f == 'l' || *f == 'z') &&
598 (f[1] == 'd' || f[1] == 'u'))
599 ++f;
600
601 switch (*f) {
602 case 'c':
603 (void)va_arg(count, int);
604 /* fall through... */
605 case '%':
606 n++;
607 break;
608 case 'd': case 'u': case 'i': case 'x':
609 (void) va_arg(count, int);
610 /* 20 bytes is enough to hold a 64-bit
611 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000612 This isn't enough for octal.
613 If a width is specified we need more
614 (which we allocate later). */
615 if (width < 20)
616 width = 20;
617 n += width;
618 if (abuffersize < width)
619 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000620 break;
621 case 's':
622 n += strlen(va_arg(count, char*));
623 break;
624 case 'U':
625 {
626 PyObject *obj = va_arg(count, PyObject *);
627 assert(obj && PyUnicode_Check(obj));
628 n += PyUnicode_GET_SIZE(obj);
629 break;
630 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000631 case 'V':
632 {
633 PyObject *obj = va_arg(count, PyObject *);
634 const char *str = va_arg(count, const char *);
635 assert(obj || str);
636 assert(!obj || PyUnicode_Check(obj));
637 if (obj)
638 n += PyUnicode_GET_SIZE(obj);
639 else
640 n += strlen(str);
641 break;
642 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000643 case 'S':
644 {
645 PyObject *obj = va_arg(count, PyObject *);
646 PyObject *str;
647 assert(obj);
648 str = PyObject_Unicode(obj);
649 if (!str)
650 goto fail;
651 n += PyUnicode_GET_SIZE(str);
652 /* Remember the str and switch to the next slot */
653 *callresult++ = str;
654 break;
655 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000656 case 'R':
657 {
658 PyObject *obj = va_arg(count, PyObject *);
659 PyObject *repr;
660 assert(obj);
661 repr = PyObject_Repr(obj);
662 if (!repr)
663 goto fail;
664 n += PyUnicode_GET_SIZE(repr);
665 /* Remember the repr and switch to the next slot */
666 *callresult++ = repr;
667 break;
668 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000669 case 'p':
670 (void) va_arg(count, int);
671 /* maximum 64-bit pointer representation:
672 * 0xffffffffffffffff
673 * so 19 characters is enough.
674 * XXX I count 18 -- what's the extra for?
675 */
676 n += 19;
677 break;
678 default:
679 /* if we stumble upon an unknown
680 formatting code, copy the rest of
681 the format string to the output
682 string. (we cannot just skip the
683 code, since there's no way to know
684 what's in the argument list) */
685 n += strlen(p);
686 goto expand;
687 }
688 } else
689 n++;
690 }
691 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000692 if (abuffersize > 20) {
693 abuffer = PyMem_Malloc(abuffersize);
694 if (!abuffer) {
695 PyErr_NoMemory();
696 goto fail;
697 }
698 realbuffer = abuffer;
699 }
700 else
701 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000702 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000703 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000704 we don't have to resize the string.
705 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000706 string = PyUnicode_FromUnicode(NULL, n);
707 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000708 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000709
710 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000711 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000712
713 for (f = format; *f; f++) {
714 if (*f == '%') {
715 const char* p = f++;
716 int longflag = 0;
717 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000718 zeropad = (*f == '0');
719 /* parse the width.precision part */
720 width = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000721 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000722 width = (width*10) + *f++ - '0';
723 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000724 if (*f == '.') {
725 f++;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000726 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000727 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000728 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000729 /* handle the long flag, but only for %ld and %lu.
730 others can be added when necessary. */
731 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
732 longflag = 1;
733 ++f;
734 }
735 /* handle the size_t flag. */
736 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
737 size_tflag = 1;
738 ++f;
739 }
740
741 switch (*f) {
742 case 'c':
743 *s++ = va_arg(vargs, int);
744 break;
745 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000746 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000747 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000748 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000749 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000750 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000751 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000752 sprintf(realbuffer, fmt, va_arg(vargs, int));
753 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000754 break;
755 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000756 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000757 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000758 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000759 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000760 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000761 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000762 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
763 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000764 break;
765 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000766 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
767 sprintf(realbuffer, fmt, va_arg(vargs, int));
768 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000769 break;
770 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000771 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
772 sprintf(realbuffer, fmt, va_arg(vargs, int));
773 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000774 break;
775 case 's':
776 p = va_arg(vargs, char*);
777 appendstring(p);
778 break;
779 case 'U':
780 {
781 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000782 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
783 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
784 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000785 break;
786 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000787 case 'V':
788 {
789 PyObject *obj = va_arg(vargs, PyObject *);
790 const char *str = va_arg(vargs, const char *);
791 if (obj) {
792 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
793 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
794 s += size;
795 } else {
796 appendstring(str);
797 }
798 break;
799 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000800 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000801 case 'R':
802 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000803 Py_UNICODE *ucopy;
804 Py_ssize_t usize;
805 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000806 /* unused, since we already have the result */
807 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000808 ucopy = PyUnicode_AS_UNICODE(*callresult);
809 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000810 for (upos = 0; upos<usize;)
811 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000812 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000813 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000814 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000815 ++callresult;
816 break;
817 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000818 case 'p':
819 sprintf(buffer, "%p", va_arg(vargs, void*));
820 /* %p is ill-defined: ensure leading 0x. */
821 if (buffer[1] == 'X')
822 buffer[1] = 'x';
823 else if (buffer[1] != 'x') {
824 memmove(buffer+2, buffer, strlen(buffer)+1);
825 buffer[0] = '0';
826 buffer[1] = 'x';
827 }
828 appendstring(buffer);
829 break;
830 case '%':
831 *s++ = '%';
832 break;
833 default:
834 appendstring(p);
835 goto end;
836 }
837 } else
838 *s++ = *f;
839 }
840
841 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000842 if (callresults)
843 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000844 if (abuffer)
845 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000846 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
847 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000848 fail:
849 if (callresults) {
850 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000851 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000852 Py_DECREF(*callresult2);
853 ++callresult2;
854 }
855 PyMem_Free(callresults);
856 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000857 if (abuffer)
858 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000859 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000860}
861
862#undef appendstring
863
864PyObject *
865PyUnicode_FromFormat(const char *format, ...)
866{
867 PyObject* ret;
868 va_list vargs;
869
870#ifdef HAVE_STDARG_PROTOTYPES
871 va_start(vargs, format);
872#else
873 va_start(vargs);
874#endif
875 ret = PyUnicode_FromFormatV(format, vargs);
876 va_end(vargs);
877 return ret;
878}
879
Martin v. Löwis18e16552006-02-15 17:27:45 +0000880Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
881 wchar_t *w,
882 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000883{
884 if (unicode == NULL) {
885 PyErr_BadInternalCall();
886 return -1;
887 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000888
889 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000890 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000891 size = PyUnicode_GET_SIZE(unicode) + 1;
892
Guido van Rossumd57fd912000-03-10 22:53:23 +0000893#ifdef HAVE_USABLE_WCHAR_T
894 memcpy(w, unicode->str, size * sizeof(wchar_t));
895#else
896 {
897 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000898 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000899 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000900 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000901 *w++ = *u++;
902 }
903#endif
904
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000905 if (size > PyUnicode_GET_SIZE(unicode))
906 return PyUnicode_GET_SIZE(unicode);
907 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000908 return size;
909}
910
911#endif
912
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000913PyObject *PyUnicode_FromOrdinal(int ordinal)
914{
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000915 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000916
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000917 if (ordinal < 0 || ordinal > 0x10ffff) {
918 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000919 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000920 return NULL;
921 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000922
923#ifndef Py_UNICODE_WIDE
924 if (ordinal > 0xffff) {
925 ordinal -= 0x10000;
926 s[0] = 0xD800 | (ordinal >> 10);
927 s[1] = 0xDC00 | (ordinal & 0x3FF);
928 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000929 }
930#endif
931
Hye-Shik Chang40574832004-04-06 07:24:51 +0000932 s[0] = (Py_UNICODE)ordinal;
933 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000934}
935
Guido van Rossumd57fd912000-03-10 22:53:23 +0000936PyObject *PyUnicode_FromObject(register PyObject *obj)
937{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000938 /* XXX Perhaps we should make this API an alias of
939 PyObject_Unicode() instead ?! */
940 if (PyUnicode_CheckExact(obj)) {
941 Py_INCREF(obj);
942 return obj;
943 }
944 if (PyUnicode_Check(obj)) {
945 /* For a Unicode subtype that's not a Unicode object,
946 return a true Unicode object with the same data. */
947 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
948 PyUnicode_GET_SIZE(obj));
949 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000950 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
951}
952
953PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
954 const char *encoding,
955 const char *errors)
956{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000957 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000958 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000959 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000960
Guido van Rossumd57fd912000-03-10 22:53:23 +0000961 if (obj == NULL) {
962 PyErr_BadInternalCall();
963 return NULL;
964 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000965
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000966#if 0
967 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000968 that no encodings is given and then redirect to
969 PyObject_Unicode() which then applies the additional logic for
970 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000971
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000972 NOTE: This API should really only be used for object which
973 represent *encoded* Unicode !
974
975 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000976 if (PyUnicode_Check(obj)) {
977 if (encoding) {
978 PyErr_SetString(PyExc_TypeError,
979 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000980 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000981 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000982 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000983 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000984#else
985 if (PyUnicode_Check(obj)) {
986 PyErr_SetString(PyExc_TypeError,
987 "decoding Unicode is not supported");
988 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000989 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000990#endif
991
992 /* Coerce object */
993 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000994 s = PyString_AS_STRING(obj);
995 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000996 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000997 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
998 /* Overwrite the error message with something more useful in
999 case of a TypeError. */
1000 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001001 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001002 "coercing to Unicode: need string or buffer, "
1003 "%.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001004 Py_Type(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001005 goto onError;
1006 }
Tim Petersced69f82003-09-16 20:30:58 +00001007
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001008 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001009 if (len == 0) {
1010 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001011 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001012 }
Tim Petersced69f82003-09-16 20:30:58 +00001013 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001014 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001015
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001016 return v;
1017
1018 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001019 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001020}
1021
1022PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001023 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001024 const char *encoding,
1025 const char *errors)
1026{
1027 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001028
1029 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001030 encoding = PyUnicode_GetDefaultEncoding();
1031
1032 /* Shortcuts for common default encodings */
1033 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001034 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001035 else if (strcmp(encoding, "latin-1") == 0)
1036 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001037#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1038 else if (strcmp(encoding, "mbcs") == 0)
1039 return PyUnicode_DecodeMBCS(s, size, errors);
1040#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001041 else if (strcmp(encoding, "ascii") == 0)
1042 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001043
1044 /* Decode via the codec registry */
1045 buffer = PyBuffer_FromMemory((void *)s, size);
1046 if (buffer == NULL)
1047 goto onError;
1048 unicode = PyCodec_Decode(buffer, encoding, errors);
1049 if (unicode == NULL)
1050 goto onError;
1051 if (!PyUnicode_Check(unicode)) {
1052 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001053 "decoder did not return an unicode object (type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001054 Py_Type(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001055 Py_DECREF(unicode);
1056 goto onError;
1057 }
1058 Py_DECREF(buffer);
1059 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001060
Guido van Rossumd57fd912000-03-10 22:53:23 +00001061 onError:
1062 Py_XDECREF(buffer);
1063 return NULL;
1064}
1065
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001066PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1067 const char *encoding,
1068 const char *errors)
1069{
1070 PyObject *v;
1071
1072 if (!PyUnicode_Check(unicode)) {
1073 PyErr_BadArgument();
1074 goto onError;
1075 }
1076
1077 if (encoding == NULL)
1078 encoding = PyUnicode_GetDefaultEncoding();
1079
1080 /* Decode via the codec registry */
1081 v = PyCodec_Decode(unicode, encoding, errors);
1082 if (v == NULL)
1083 goto onError;
1084 return v;
1085
1086 onError:
1087 return NULL;
1088}
1089
Guido van Rossumd57fd912000-03-10 22:53:23 +00001090PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001091 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001092 const char *encoding,
1093 const char *errors)
1094{
1095 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001096
Guido van Rossumd57fd912000-03-10 22:53:23 +00001097 unicode = PyUnicode_FromUnicode(s, size);
1098 if (unicode == NULL)
1099 return NULL;
1100 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1101 Py_DECREF(unicode);
1102 return v;
1103}
1104
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001105PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1106 const char *encoding,
1107 const char *errors)
1108{
1109 PyObject *v;
1110
1111 if (!PyUnicode_Check(unicode)) {
1112 PyErr_BadArgument();
1113 goto onError;
1114 }
1115
1116 if (encoding == NULL)
1117 encoding = PyUnicode_GetDefaultEncoding();
1118
1119 /* Encode via the codec registry */
1120 v = PyCodec_Encode(unicode, encoding, errors);
1121 if (v == NULL)
1122 goto onError;
1123 return v;
1124
1125 onError:
1126 return NULL;
1127}
1128
Guido van Rossumd57fd912000-03-10 22:53:23 +00001129PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1130 const char *encoding,
1131 const char *errors)
1132{
1133 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001134
Guido van Rossumd57fd912000-03-10 22:53:23 +00001135 if (!PyUnicode_Check(unicode)) {
1136 PyErr_BadArgument();
1137 goto onError;
1138 }
Fred Drakee4315f52000-05-09 19:53:39 +00001139
Tim Petersced69f82003-09-16 20:30:58 +00001140 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001141 encoding = PyUnicode_GetDefaultEncoding();
1142
1143 /* Shortcuts for common default encodings */
1144 if (errors == NULL) {
1145 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001146 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001147 else if (strcmp(encoding, "latin-1") == 0)
1148 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001149#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1150 else if (strcmp(encoding, "mbcs") == 0)
1151 return PyUnicode_AsMBCSString(unicode);
1152#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001153 else if (strcmp(encoding, "ascii") == 0)
1154 return PyUnicode_AsASCIIString(unicode);
1155 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001156
1157 /* Encode via the codec registry */
1158 v = PyCodec_Encode(unicode, encoding, errors);
1159 if (v == NULL)
1160 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001161 if (!PyBytes_Check(v)) {
1162 if (PyString_Check(v)) {
1163 /* Old codec, turn it into bytes */
1164 PyObject *b = PyBytes_FromObject(v);
1165 Py_DECREF(v);
1166 return b;
1167 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001169 "encoder did not return a bytes object "
1170 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1171 v->ob_type->tp_name,
1172 encoding ? encoding : "NULL",
1173 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174 Py_DECREF(v);
1175 goto onError;
1176 }
1177 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001178
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179 onError:
1180 return NULL;
1181}
1182
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001183PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1184 const char *errors)
1185{
1186 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001187 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001188 if (v)
1189 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001190 if (errors != NULL)
1191 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Neal Norwitzab40b302007-08-12 17:21:38 +00001192 /* XXX(nnorwitz): errors will always be NULL due to the check above.
1193 Should this check and the else be removed since it's dead code?
1194 */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001195 if (errors == NULL) {
1196 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1197 PyUnicode_GET_SIZE(unicode),
1198 NULL);
1199 }
1200 else {
1201 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
1202 }
1203 if (!b)
1204 return NULL;
1205 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1206 PyBytes_Size(b));
1207 Py_DECREF(b);
Guido van Rossume7a0d392007-07-12 07:53:00 +00001208 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001209 return v;
1210}
1211
Martin v. Löwis5b222132007-06-10 09:51:05 +00001212char*
1213PyUnicode_AsString(PyObject *unicode)
1214{
1215 assert(PyUnicode_Check(unicode));
1216 unicode = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1217 if (!unicode)
1218 return NULL;
1219 return PyString_AsString(unicode);
1220}
1221
Guido van Rossumd57fd912000-03-10 22:53:23 +00001222Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1223{
1224 if (!PyUnicode_Check(unicode)) {
1225 PyErr_BadArgument();
1226 goto onError;
1227 }
1228 return PyUnicode_AS_UNICODE(unicode);
1229
1230 onError:
1231 return NULL;
1232}
1233
Martin v. Löwis18e16552006-02-15 17:27:45 +00001234Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001235{
1236 if (!PyUnicode_Check(unicode)) {
1237 PyErr_BadArgument();
1238 goto onError;
1239 }
1240 return PyUnicode_GET_SIZE(unicode);
1241
1242 onError:
1243 return -1;
1244}
1245
Thomas Wouters78890102000-07-22 19:25:51 +00001246const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001247{
1248 return unicode_default_encoding;
1249}
1250
1251int PyUnicode_SetDefaultEncoding(const char *encoding)
1252{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001253 if (strcmp(encoding, unicode_default_encoding) != 0) {
1254 PyErr_Format(PyExc_ValueError,
1255 "Can only set default encoding to %s",
1256 unicode_default_encoding);
1257 return -1;
1258 }
Fred Drakee4315f52000-05-09 19:53:39 +00001259 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001260}
1261
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001262/* error handling callback helper:
1263 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001264 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001265 and adjust various state variables.
1266 return 0 on success, -1 on error
1267*/
1268
1269static
1270int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1271 const char *encoding, const char *reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001272 const char **input, const char **inend, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001273 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001274{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001275 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001276
1277 PyObject *restuple = NULL;
1278 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001279 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001280 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001281 Py_ssize_t requiredsize;
1282 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001283 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001284 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001285 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001286 int res = -1;
1287
1288 if (*errorHandler == NULL) {
1289 *errorHandler = PyCodec_LookupError(errors);
1290 if (*errorHandler == NULL)
1291 goto onError;
1292 }
1293
1294 if (*exceptionObject == NULL) {
1295 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001296 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001297 if (*exceptionObject == NULL)
1298 goto onError;
1299 }
1300 else {
1301 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1302 goto onError;
1303 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1304 goto onError;
1305 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1306 goto onError;
1307 }
1308
1309 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1310 if (restuple == NULL)
1311 goto onError;
1312 if (!PyTuple_Check(restuple)) {
1313 PyErr_Format(PyExc_TypeError, &argparse[4]);
1314 goto onError;
1315 }
1316 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1317 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001318
1319 /* Copy back the bytes variables, which might have been modified by the
1320 callback */
1321 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1322 if (!inputobj)
1323 goto onError;
1324 if (!PyBytes_Check(inputobj)) {
1325 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1326 }
1327 *input = PyBytes_AS_STRING(inputobj);
1328 insize = PyBytes_GET_SIZE(inputobj);
1329 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001330 /* we can DECREF safely, as the exception has another reference,
1331 so the object won't go away. */
1332 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001333
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001334 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001335 newpos = insize+newpos;
1336 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001337 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001338 goto onError;
1339 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001340
1341 /* need more space? (at least enough for what we
1342 have+the replacement+the rest of the string (starting
1343 at the new input position), so we won't have to check space
1344 when there are no errors in the rest of the string) */
1345 repptr = PyUnicode_AS_UNICODE(repunicode);
1346 repsize = PyUnicode_GET_SIZE(repunicode);
1347 requiredsize = *outpos + repsize + insize-newpos;
1348 if (requiredsize > outsize) {
1349 if (requiredsize<2*outsize)
1350 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001351 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001352 goto onError;
1353 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1354 }
1355 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001356 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001357 Py_UNICODE_COPY(*outptr, repptr, repsize);
1358 *outptr += repsize;
1359 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001360
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001361 /* we made it! */
1362 res = 0;
1363
1364 onError:
1365 Py_XDECREF(restuple);
1366 return res;
1367}
1368
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001369/* --- UTF-7 Codec -------------------------------------------------------- */
1370
1371/* see RFC2152 for details */
1372
/* Classification of the 128 ASCII code points for UTF-7, i.e. whether a
   character is special and cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)
   The table is indexed by the character code. */
static char utf7_special[128] = {
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f: space ! " # $ % & ' ( ) * + , - . / */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f: 0-9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f: @ A-O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f: P-Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f: ` a-o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f: p-z { | } ~ DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1391
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if character c must not appear directly in UTF-7 output/input:
   non-ASCII, class 1 in utf7_special, or class 2/3 when the matching
   encodeWS/encodeO flag is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Base64 digit for the low 6 bits of n. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is one of the 64 base64 digits.  Explicit ASCII ranges are
   used instead of isalnum(): the argument may be a Py_UNICODE value
   outside the unsigned char range (undefined for <ctype.h> functions),
   and isalnum() is locale dependent. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Value (0..63) of base64 digit c; only meaningful if B64CHAR(c). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits to out for as long as at least 6 bits are pending
   in ch; bits is decremented by 6 per digit written. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* Emit 16-bit units to out for as long as at least 16 bits are pending
   in ch.  The expansion assigns to a variable named errmsg and jumps to
   a label named utf7Error, so both must exist where it is used. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001434
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001435PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001436 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001437 const char *errors)
1438{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001439 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001440 Py_ssize_t startinpos;
1441 Py_ssize_t endinpos;
1442 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001443 const char *e;
1444 PyUnicodeObject *unicode;
1445 Py_UNICODE *p;
1446 const char *errmsg = "";
1447 int inShift = 0;
1448 unsigned int bitsleft = 0;
1449 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001450 int surrogate = 0;
1451 PyObject *errorHandler = NULL;
1452 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001453
1454 unicode = _PyUnicode_New(size);
1455 if (!unicode)
1456 return NULL;
1457 if (size == 0)
1458 return (PyObject *)unicode;
1459
1460 p = unicode->str;
1461 e = s + size;
1462
1463 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001464 Py_UNICODE ch;
1465 restart:
1466 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001467
1468 if (inShift) {
1469 if ((ch == '-') || !B64CHAR(ch)) {
1470 inShift = 0;
1471 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001472
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001473 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1474 if (bitsleft >= 6) {
1475 /* The shift sequence has a partial character in it. If
1476 bitsleft < 6 then we could just classify it as padding
1477 but that is not the case here */
1478
1479 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001480 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001481 }
1482 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001483 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001484 here so indicate the potential of a misencoded character. */
1485
1486 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1487 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1488 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001489 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001490 }
1491
1492 if (ch == '-') {
1493 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001494 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001495 inShift = 1;
1496 }
1497 } else if (SPECIAL(ch,0,0)) {
1498 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001499 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001500 } else {
1501 *p++ = ch;
1502 }
1503 } else {
1504 charsleft = (charsleft << 6) | UB64(ch);
1505 bitsleft += 6;
1506 s++;
1507 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1508 }
1509 }
1510 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001511 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001512 s++;
1513 if (s < e && *s == '-') {
1514 s++;
1515 *p++ = '+';
1516 } else
1517 {
1518 inShift = 1;
1519 bitsleft = 0;
1520 }
1521 }
1522 else if (SPECIAL(ch,0,0)) {
1523 errmsg = "unexpected special character";
1524 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001525 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001526 }
1527 else {
1528 *p++ = ch;
1529 s++;
1530 }
1531 continue;
1532 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001533 outpos = p-PyUnicode_AS_UNICODE(unicode);
1534 endinpos = s-starts;
1535 if (unicode_decode_call_errorhandler(
1536 errors, &errorHandler,
1537 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001538 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001539 (PyObject **)&unicode, &outpos, &p))
1540 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001541 }
1542
1543 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001544 outpos = p-PyUnicode_AS_UNICODE(unicode);
1545 endinpos = size;
1546 if (unicode_decode_call_errorhandler(
1547 errors, &errorHandler,
1548 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001549 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001550 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001551 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001552 if (s < e)
1553 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001554 }
1555
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001556 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001557 goto onError;
1558
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001559 Py_XDECREF(errorHandler);
1560 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001561 return (PyObject *)unicode;
1562
1563onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001564 Py_XDECREF(errorHandler);
1565 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001566 Py_DECREF(unicode);
1567 return NULL;
1568}
1569
1570
1571PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001572 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001573 int encodeSetO,
1574 int encodeWhiteSpace,
1575 const char *errors)
1576{
1577 PyObject *v;
1578 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001579 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001580 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001581 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001582 unsigned int bitsleft = 0;
1583 unsigned long charsleft = 0;
1584 char * out;
1585 char * start;
1586
1587 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001588 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001589
Walter Dörwald51ab4142007-05-05 14:43:36 +00001590 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001591 if (v == NULL)
1592 return NULL;
1593
Walter Dörwald51ab4142007-05-05 14:43:36 +00001594 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001595 for (;i < size; ++i) {
1596 Py_UNICODE ch = s[i];
1597
1598 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001599 if (ch == '+') {
1600 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001601 *out++ = '-';
1602 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1603 charsleft = ch;
1604 bitsleft = 16;
1605 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001606 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001607 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001608 } else {
1609 *out++ = (char) ch;
1610 }
1611 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001612 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1613 *out++ = B64(charsleft << (6-bitsleft));
1614 charsleft = 0;
1615 bitsleft = 0;
1616 /* Characters not in the BASE64 set implicitly unshift the sequence
1617 so no '-' is required, except if the character is itself a '-' */
1618 if (B64CHAR(ch) || ch == '-') {
1619 *out++ = '-';
1620 }
1621 inShift = 0;
1622 *out++ = (char) ch;
1623 } else {
1624 bitsleft += 16;
1625 charsleft = (charsleft << 16) | ch;
1626 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1627
1628 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001629 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001630 or '-' then the shift sequence will be terminated implicitly and we
1631 don't have to insert a '-'. */
1632
1633 if (bitsleft == 0) {
1634 if (i + 1 < size) {
1635 Py_UNICODE ch2 = s[i+1];
1636
1637 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001638
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001639 } else if (B64CHAR(ch2) || ch2 == '-') {
1640 *out++ = '-';
1641 inShift = 0;
1642 } else {
1643 inShift = 0;
1644 }
1645
1646 }
1647 else {
1648 *out++ = '-';
1649 inShift = 0;
1650 }
1651 }
Tim Petersced69f82003-09-16 20:30:58 +00001652 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001653 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001654 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001655 if (bitsleft) {
1656 *out++= B64(charsleft << (6-bitsleft) );
1657 *out++ = '-';
1658 }
1659
Walter Dörwald51ab4142007-05-05 14:43:36 +00001660 if (PyBytes_Resize(v, out - start)) {
1661 Py_DECREF(v);
1662 return NULL;
1663 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001664 return v;
1665}
1666
1667#undef SPECIAL
1668#undef B64
1669#undef B64CHAR
1670#undef UB64
1671#undef ENCODE
1672#undef DECODE
1673
Guido van Rossumd57fd912000-03-10 22:53:23 +00001674/* --- UTF-8 Codec -------------------------------------------------------- */
1675
/* Map a UTF-8 lead (prefix) byte to the total length in bytes of the
   sequence it introduces.  Zero means the byte is not a legal prefix
   (continuation bytes 0x80-0xBF, and 0xFE/0xFF).  See RFC 2279 for
   details.  The table is only ever read, hence const. */
static
const char utf8_code_length[256] = {
    /* 0x00-0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF7: four, 0xF8-0xFB: five, 0xFC-0xFD: six, 0xFE-0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1697
Guido van Rossumd57fd912000-03-10 22:53:23 +00001698PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001699 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001700 const char *errors)
1701{
Walter Dörwald69652032004-09-07 20:24:22 +00001702 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1703}
1704
1705PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001706 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001707 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001708 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001709{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001710 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001711 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001712 Py_ssize_t startinpos;
1713 Py_ssize_t endinpos;
1714 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001715 const char *e;
1716 PyUnicodeObject *unicode;
1717 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001718 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001719 PyObject *errorHandler = NULL;
1720 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001721
1722 /* Note: size will always be longer than the resulting Unicode
1723 character count */
1724 unicode = _PyUnicode_New(size);
1725 if (!unicode)
1726 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001727 if (size == 0) {
1728 if (consumed)
1729 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001730 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001731 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001732
1733 /* Unpack UTF-8 encoded data */
1734 p = unicode->str;
1735 e = s + size;
1736
1737 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001738 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739
1740 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001741 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001742 s++;
1743 continue;
1744 }
1745
1746 n = utf8_code_length[ch];
1747
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001748 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001749 if (consumed)
1750 break;
1751 else {
1752 errmsg = "unexpected end of data";
1753 startinpos = s-starts;
1754 endinpos = size;
1755 goto utf8Error;
1756 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001757 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001758
1759 switch (n) {
1760
1761 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001762 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001763 startinpos = s-starts;
1764 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001765 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001766
1767 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001768 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001769 startinpos = s-starts;
1770 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001771 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001772
1773 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001774 if ((s[1] & 0xc0) != 0x80) {
1775 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001776 startinpos = s-starts;
1777 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001778 goto utf8Error;
1779 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001781 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001782 startinpos = s-starts;
1783 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001784 errmsg = "illegal encoding";
1785 goto utf8Error;
1786 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001787 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001788 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001789 break;
1790
1791 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001792 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001793 (s[2] & 0xc0) != 0x80) {
1794 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001795 startinpos = s-starts;
1796 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001797 goto utf8Error;
1798 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001799 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001800 if (ch < 0x0800) {
1801 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001802 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001803
1804 XXX For wide builds (UCS-4) we should probably try
1805 to recombine the surrogates into a single code
1806 unit.
1807 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001808 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001809 startinpos = s-starts;
1810 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001811 goto utf8Error;
1812 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001813 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001814 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001815 break;
1816
1817 case 4:
1818 if ((s[1] & 0xc0) != 0x80 ||
1819 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001820 (s[3] & 0xc0) != 0x80) {
1821 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001822 startinpos = s-starts;
1823 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001824 goto utf8Error;
1825 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001826 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1827 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1828 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001829 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001830 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001831 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001832 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001833 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001834 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001835 startinpos = s-starts;
1836 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001837 goto utf8Error;
1838 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001839#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001840 *p++ = (Py_UNICODE)ch;
1841#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001842 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001843
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001844 /* translate from 10000..10FFFF to 0..FFFF */
1845 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001846
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001847 /* high surrogate = top 10 bits added to D800 */
1848 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001849
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001850 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001851 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001852#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001853 break;
1854
1855 default:
1856 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001857 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001858 startinpos = s-starts;
1859 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001860 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001861 }
1862 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001863 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001864
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001865 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001866 outpos = p-PyUnicode_AS_UNICODE(unicode);
1867 if (unicode_decode_call_errorhandler(
1868 errors, &errorHandler,
1869 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001870 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001871 (PyObject **)&unicode, &outpos, &p))
1872 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001873 }
Walter Dörwald69652032004-09-07 20:24:22 +00001874 if (consumed)
1875 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001876
1877 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001878 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001879 goto onError;
1880
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001881 Py_XDECREF(errorHandler);
1882 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001883 return (PyObject *)unicode;
1884
1885onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001886 Py_XDECREF(errorHandler);
1887 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001888 Py_DECREF(unicode);
1889 return NULL;
1890}
1891
Tim Peters602f7402002-04-27 18:03:26 +00001892/* Allocation strategy: if the string is short, convert into a stack buffer
1893 and allocate exactly as much space needed at the end. Else allocate the
1894 maximum possible needed (4 result bytes per Unicode character), and return
1895 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001896*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001897PyObject *
1898PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001899 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001900 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001901{
Tim Peters602f7402002-04-27 18:03:26 +00001902#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001903
Martin v. Löwis18e16552006-02-15 17:27:45 +00001904 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001905 PyObject *v; /* result string object */
1906 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001907 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001908 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001909 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001910
Tim Peters602f7402002-04-27 18:03:26 +00001911 assert(s != NULL);
1912 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001913
Tim Peters602f7402002-04-27 18:03:26 +00001914 if (size <= MAX_SHORT_UNICHARS) {
1915 /* Write into the stack buffer; nallocated can't overflow.
1916 * At the end, we'll allocate exactly as much heap space as it
1917 * turns out we need.
1918 */
1919 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1920 v = NULL; /* will allocate after we're done */
1921 p = stackbuf;
1922 }
1923 else {
1924 /* Overallocate on the heap, and give the excess back at the end. */
1925 nallocated = size * 4;
1926 if (nallocated / 4 != size) /* overflow! */
1927 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001928 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001929 if (v == NULL)
1930 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001931 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001932 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001933
Tim Peters602f7402002-04-27 18:03:26 +00001934 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001935 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001936
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001937 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001938 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001939 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001940
Guido van Rossumd57fd912000-03-10 22:53:23 +00001941 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001942 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001943 *p++ = (char)(0xc0 | (ch >> 6));
1944 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001945 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001946 else {
Tim Peters602f7402002-04-27 18:03:26 +00001947 /* Encode UCS2 Unicode ordinals */
1948 if (ch < 0x10000) {
1949 /* Special case: check for high surrogate */
1950 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1951 Py_UCS4 ch2 = s[i];
1952 /* Check for low surrogate and combine the two to
1953 form a UCS4 value */
1954 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001955 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001956 i++;
1957 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001958 }
Tim Peters602f7402002-04-27 18:03:26 +00001959 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001960 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001961 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001962 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1963 *p++ = (char)(0x80 | (ch & 0x3f));
1964 continue;
1965 }
1966encodeUCS4:
1967 /* Encode UCS4 Unicode ordinals */
1968 *p++ = (char)(0xf0 | (ch >> 18));
1969 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1970 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1971 *p++ = (char)(0x80 | (ch & 0x3f));
1972 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001973 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001974
Tim Peters602f7402002-04-27 18:03:26 +00001975 if (v == NULL) {
1976 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001977 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001978 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001979 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001980 }
1981 else {
1982 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001983 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001984 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001985 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001986 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001988
Tim Peters602f7402002-04-27 18:03:26 +00001989#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001990}
1991
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1993{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001994 if (!PyUnicode_Check(unicode)) {
1995 PyErr_BadArgument();
1996 return NULL;
1997 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001998 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1999 PyUnicode_GET_SIZE(unicode),
2000 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002001}
2002
2003/* --- UTF-16 Codec ------------------------------------------------------- */
2004
Tim Peters772747b2001-08-09 22:21:55 +00002005PyObject *
2006PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002007 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002008 const char *errors,
2009 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002010{
Walter Dörwald69652032004-09-07 20:24:22 +00002011 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2012}
2013
2014PyObject *
2015PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002016 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002017 const char *errors,
2018 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002019 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002020{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002021 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002022 Py_ssize_t startinpos;
2023 Py_ssize_t endinpos;
2024 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002025 PyUnicodeObject *unicode;
2026 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002027 const unsigned char *q, *e;
2028 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002029 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002030 /* Offsets from q for retrieving byte pairs in the right order. */
2031#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2032 int ihi = 1, ilo = 0;
2033#else
2034 int ihi = 0, ilo = 1;
2035#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002036 PyObject *errorHandler = NULL;
2037 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002038
2039 /* Note: size will always be longer than the resulting Unicode
2040 character count */
2041 unicode = _PyUnicode_New(size);
2042 if (!unicode)
2043 return NULL;
2044 if (size == 0)
2045 return (PyObject *)unicode;
2046
2047 /* Unpack UTF-16 encoded data */
2048 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002049 q = (unsigned char *)s;
2050 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051
2052 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002053 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002055 /* Check for BOM marks (U+FEFF) in the input and adjust current
2056 byte order setting accordingly. In native mode, the leading BOM
2057 mark is skipped, in all other modes, it is copied to the output
2058 stream as-is (giving a ZWNBSP character). */
2059 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002060 if (size >= 2) {
2061 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002062#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002063 if (bom == 0xFEFF) {
2064 q += 2;
2065 bo = -1;
2066 }
2067 else if (bom == 0xFFFE) {
2068 q += 2;
2069 bo = 1;
2070 }
Tim Petersced69f82003-09-16 20:30:58 +00002071#else
Walter Dörwald69652032004-09-07 20:24:22 +00002072 if (bom == 0xFEFF) {
2073 q += 2;
2074 bo = 1;
2075 }
2076 else if (bom == 0xFFFE) {
2077 q += 2;
2078 bo = -1;
2079 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002080#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002081 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002082 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002083
Tim Peters772747b2001-08-09 22:21:55 +00002084 if (bo == -1) {
2085 /* force LE */
2086 ihi = 1;
2087 ilo = 0;
2088 }
2089 else if (bo == 1) {
2090 /* force BE */
2091 ihi = 0;
2092 ilo = 1;
2093 }
2094
2095 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002096 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002097 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002098 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002099 if (consumed)
2100 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002101 errmsg = "truncated data";
2102 startinpos = ((const char *)q)-starts;
2103 endinpos = ((const char *)e)-starts;
2104 goto utf16Error;
2105 /* The remaining input chars are ignored if the callback
2106 chooses to skip the input */
2107 }
2108 ch = (q[ihi] << 8) | q[ilo];
2109
Tim Peters772747b2001-08-09 22:21:55 +00002110 q += 2;
2111
Guido van Rossumd57fd912000-03-10 22:53:23 +00002112 if (ch < 0xD800 || ch > 0xDFFF) {
2113 *p++ = ch;
2114 continue;
2115 }
2116
2117 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002118 if (q >= e) {
2119 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002120 startinpos = (((const char *)q)-2)-starts;
2121 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002122 goto utf16Error;
2123 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002124 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002125 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2126 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002127 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002128#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002129 *p++ = ch;
2130 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002131#else
2132 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002133#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002134 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002135 }
2136 else {
2137 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002138 startinpos = (((const char *)q)-4)-starts;
2139 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002140 goto utf16Error;
2141 }
2142
Guido van Rossumd57fd912000-03-10 22:53:23 +00002143 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002144 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002145 startinpos = (((const char *)q)-2)-starts;
2146 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002147 /* Fall through to report the error */
2148
2149 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002150 outpos = p-PyUnicode_AS_UNICODE(unicode);
2151 if (unicode_decode_call_errorhandler(
2152 errors, &errorHandler,
2153 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002154 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002155 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002156 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002157 }
2158
2159 if (byteorder)
2160 *byteorder = bo;
2161
Walter Dörwald69652032004-09-07 20:24:22 +00002162 if (consumed)
2163 *consumed = (const char *)q-starts;
2164
Guido van Rossumd57fd912000-03-10 22:53:23 +00002165 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002166 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167 goto onError;
2168
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002169 Py_XDECREF(errorHandler);
2170 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002171 return (PyObject *)unicode;
2172
2173onError:
2174 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002175 Py_XDECREF(errorHandler);
2176 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002177 return NULL;
2178}
2179
Tim Peters772747b2001-08-09 22:21:55 +00002180PyObject *
2181PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002182 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002183 const char *errors,
2184 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002185{
2186 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002187 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002188#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002189 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002190#else
2191 const int pairs = 0;
2192#endif
Tim Peters772747b2001-08-09 22:21:55 +00002193 /* Offsets from p for storing byte pairs in the right order. */
2194#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2195 int ihi = 1, ilo = 0;
2196#else
2197 int ihi = 0, ilo = 1;
2198#endif
2199
2200#define STORECHAR(CH) \
2201 do { \
2202 p[ihi] = ((CH) >> 8) & 0xff; \
2203 p[ilo] = (CH) & 0xff; \
2204 p += 2; \
2205 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002206
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002207#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002208 for (i = pairs = 0; i < size; i++)
2209 if (s[i] >= 0x10000)
2210 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002211#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002212 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002213 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002214 if (v == NULL)
2215 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002216
Walter Dörwald3cc34522007-05-04 10:48:27 +00002217 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002218 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002219 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002220 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002221 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002222
2223 if (byteorder == -1) {
2224 /* force LE */
2225 ihi = 1;
2226 ilo = 0;
2227 }
2228 else if (byteorder == 1) {
2229 /* force BE */
2230 ihi = 0;
2231 ilo = 1;
2232 }
2233
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002234 while (size-- > 0) {
2235 Py_UNICODE ch = *s++;
2236 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002237#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002238 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002239 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2240 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002241 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002242#endif
Tim Peters772747b2001-08-09 22:21:55 +00002243 STORECHAR(ch);
2244 if (ch2)
2245 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002246 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002247 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002248#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002249}
2250
2251PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2252{
2253 if (!PyUnicode_Check(unicode)) {
2254 PyErr_BadArgument();
2255 return NULL;
2256 }
2257 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2258 PyUnicode_GET_SIZE(unicode),
2259 NULL,
2260 0);
2261}
2262
2263/* --- Unicode Escape Codec ----------------------------------------------- */
2264
Fredrik Lundh06d12682001-01-24 07:59:11 +00002265static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002266
Guido van Rossumd57fd912000-03-10 22:53:23 +00002267PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002268 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002269 const char *errors)
2270{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002271 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002272 Py_ssize_t startinpos;
2273 Py_ssize_t endinpos;
2274 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002275 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002276 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002277 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002278 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002279 char* message;
2280 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002281 PyObject *errorHandler = NULL;
2282 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002283
Guido van Rossumd57fd912000-03-10 22:53:23 +00002284 /* Escaped strings will always be longer than the resulting
2285 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002286 length after conversion to the true value.
2287 (but if the error callback returns a long replacement string
2288 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002289 v = _PyUnicode_New(size);
2290 if (v == NULL)
2291 goto onError;
2292 if (size == 0)
2293 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002294
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002295 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002296 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002297
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298 while (s < end) {
2299 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002300 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002301 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002302
2303 /* Non-escape characters are interpreted as Unicode ordinals */
2304 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002305 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002306 continue;
2307 }
2308
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002309 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002310 /* \ - Escapes */
2311 s++;
2312 switch (*s++) {
2313
2314 /* \x escapes */
2315 case '\n': break;
2316 case '\\': *p++ = '\\'; break;
2317 case '\'': *p++ = '\''; break;
2318 case '\"': *p++ = '\"'; break;
2319 case 'b': *p++ = '\b'; break;
2320 case 'f': *p++ = '\014'; break; /* FF */
2321 case 't': *p++ = '\t'; break;
2322 case 'n': *p++ = '\n'; break;
2323 case 'r': *p++ = '\r'; break;
2324 case 'v': *p++ = '\013'; break; /* VT */
2325 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2326
2327 /* \OOO (octal) escapes */
2328 case '0': case '1': case '2': case '3':
2329 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002330 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002331 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002332 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002333 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002334 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002335 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002336 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002337 break;
2338
Fredrik Lundhccc74732001-02-18 22:13:49 +00002339 /* hex escapes */
2340 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002341 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002342 digits = 2;
2343 message = "truncated \\xXX escape";
2344 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002345
Fredrik Lundhccc74732001-02-18 22:13:49 +00002346 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002347 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002348 digits = 4;
2349 message = "truncated \\uXXXX escape";
2350 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002351
Fredrik Lundhccc74732001-02-18 22:13:49 +00002352 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002353 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002354 digits = 8;
2355 message = "truncated \\UXXXXXXXX escape";
2356 hexescape:
2357 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002358 outpos = p-PyUnicode_AS_UNICODE(v);
2359 if (s+digits>end) {
2360 endinpos = size;
2361 if (unicode_decode_call_errorhandler(
2362 errors, &errorHandler,
2363 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002364 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002365 (PyObject **)&v, &outpos, &p))
2366 goto onError;
2367 goto nextByte;
2368 }
2369 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002370 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002371 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002372 endinpos = (s+i+1)-starts;
2373 if (unicode_decode_call_errorhandler(
2374 errors, &errorHandler,
2375 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002376 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002377 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002378 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002379 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002380 }
2381 chr = (chr<<4) & ~0xF;
2382 if (c >= '0' && c <= '9')
2383 chr += c - '0';
2384 else if (c >= 'a' && c <= 'f')
2385 chr += 10 + c - 'a';
2386 else
2387 chr += 10 + c - 'A';
2388 }
2389 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002390 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002391 /* _decoding_error will have already written into the
2392 target buffer. */
2393 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002394 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002395 /* when we get here, chr is a 32-bit unicode character */
2396 if (chr <= 0xffff)
2397 /* UCS-2 character */
2398 *p++ = (Py_UNICODE) chr;
2399 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002400 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002401 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002402#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002403 *p++ = chr;
2404#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002405 chr -= 0x10000L;
2406 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002407 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002408#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002409 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002410 endinpos = s-starts;
2411 outpos = p-PyUnicode_AS_UNICODE(v);
2412 if (unicode_decode_call_errorhandler(
2413 errors, &errorHandler,
2414 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002415 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002416 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002417 goto onError;
2418 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002419 break;
2420
2421 /* \N{name} */
2422 case 'N':
2423 message = "malformed \\N character escape";
2424 if (ucnhash_CAPI == NULL) {
2425 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002426 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002427 m = PyImport_ImportModule("unicodedata");
2428 if (m == NULL)
2429 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002430 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002431 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002432 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002433 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002434 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002435 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002436 if (ucnhash_CAPI == NULL)
2437 goto ucnhashError;
2438 }
2439 if (*s == '{') {
2440 const char *start = s+1;
2441 /* look for the closing brace */
2442 while (*s != '}' && s < end)
2443 s++;
2444 if (s > start && s < end && *s == '}') {
2445 /* found a name. look it up in the unicode database */
2446 message = "unknown Unicode character name";
2447 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002448 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002449 goto store;
2450 }
2451 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002452 endinpos = s-starts;
2453 outpos = p-PyUnicode_AS_UNICODE(v);
2454 if (unicode_decode_call_errorhandler(
2455 errors, &errorHandler,
2456 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002457 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002458 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002459 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002460 break;
2461
2462 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002463 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002464 message = "\\ at end of string";
2465 s--;
2466 endinpos = s-starts;
2467 outpos = p-PyUnicode_AS_UNICODE(v);
2468 if (unicode_decode_call_errorhandler(
2469 errors, &errorHandler,
2470 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002471 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002472 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002473 goto onError;
2474 }
2475 else {
2476 *p++ = '\\';
2477 *p++ = (unsigned char)s[-1];
2478 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002479 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002480 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002481 nextByte:
2482 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002483 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002484 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002485 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002486 Py_XDECREF(errorHandler);
2487 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002488 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002489
Fredrik Lundhccc74732001-02-18 22:13:49 +00002490ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002491 PyErr_SetString(
2492 PyExc_UnicodeError,
2493 "\\N escapes not supported (can't load unicodedata module)"
2494 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002495 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002496 Py_XDECREF(errorHandler);
2497 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002498 return NULL;
2499
Fredrik Lundhccc74732001-02-18 22:13:49 +00002500onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002501 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002502 Py_XDECREF(errorHandler);
2503 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002504 return NULL;
2505}
2506
2507/* Return a Unicode-Escape string version of the Unicode object.
2508
2509 If quotes is true, the string is enclosed in u"" or u'' quotes as
2510 appropriate.
2511
2512*/
2513
Thomas Wouters477c8d52006-05-27 19:21:47 +00002514Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2515 Py_ssize_t size,
2516 Py_UNICODE ch)
2517{
2518 /* like wcschr, but doesn't stop at NULL characters */
2519
2520 while (size-- > 0) {
2521 if (*s == ch)
2522 return s;
2523 s++;
2524 }
2525
2526 return NULL;
2527}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002528
Walter Dörwald79e913e2007-05-12 11:08:06 +00002529static const char *hexdigits = "0123456789abcdef";
2530
2531PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2532 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002533{
2534 PyObject *repr;
2535 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002536
Thomas Wouters89f507f2006-12-13 04:49:30 +00002537 /* XXX(nnorwitz): rather than over-allocating, it would be
2538 better to choose a different scheme. Perhaps scan the
2539 first N-chars of the string and allocate based on that size.
2540 */
2541 /* Initial allocation is based on the longest-possible unichr
2542 escape.
2543
2544 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2545 unichr, so in this case it's the longest unichr escape. In
2546 narrow (UTF-16) builds this is five chars per source unichr
2547 since there are two unichrs in the surrogate pair, so in narrow
2548 (UTF-16) builds it's not the longest unichr escape.
2549
2550 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2551 so in the narrow (UTF-16) build case it's the longest unichr
2552 escape.
2553 */
2554
Walter Dörwald79e913e2007-05-12 11:08:06 +00002555 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002556#ifdef Py_UNICODE_WIDE
2557 + 10*size
2558#else
2559 + 6*size
2560#endif
2561 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002562 if (repr == NULL)
2563 return NULL;
2564
Walter Dörwald79e913e2007-05-12 11:08:06 +00002565 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002566
Guido van Rossumd57fd912000-03-10 22:53:23 +00002567 while (size-- > 0) {
2568 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002569
Walter Dörwald79e913e2007-05-12 11:08:06 +00002570 /* Escape backslashes */
2571 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002572 *p++ = '\\';
2573 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002574 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002575 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002576
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002577#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002578 /* Map 21-bit characters to '\U00xxxxxx' */
2579 else if (ch >= 0x10000) {
2580 *p++ = '\\';
2581 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002582 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2583 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2584 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2585 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2586 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2587 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2588 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2589 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002590 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002591 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002592#else
2593 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002594 else if (ch >= 0xD800 && ch < 0xDC00) {
2595 Py_UNICODE ch2;
2596 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002597
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002598 ch2 = *s++;
2599 size--;
2600 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2601 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2602 *p++ = '\\';
2603 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002604 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2605 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2606 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2607 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2608 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2609 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2610 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2611 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002612 continue;
2613 }
2614 /* Fall through: isolated surrogates are copied as-is */
2615 s--;
2616 size++;
2617 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002618#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002619
Guido van Rossumd57fd912000-03-10 22:53:23 +00002620 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002621 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002622 *p++ = '\\';
2623 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002624 *p++ = hexdigits[(ch >> 12) & 0x000F];
2625 *p++ = hexdigits[(ch >> 8) & 0x000F];
2626 *p++ = hexdigits[(ch >> 4) & 0x000F];
2627 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002628 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002629
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002630 /* Map special whitespace to '\t', \n', '\r' */
2631 else if (ch == '\t') {
2632 *p++ = '\\';
2633 *p++ = 't';
2634 }
2635 else if (ch == '\n') {
2636 *p++ = '\\';
2637 *p++ = 'n';
2638 }
2639 else if (ch == '\r') {
2640 *p++ = '\\';
2641 *p++ = 'r';
2642 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002643
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002644 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002645 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002646 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002647 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002648 *p++ = hexdigits[(ch >> 4) & 0x000F];
2649 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002650 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002651
Guido van Rossumd57fd912000-03-10 22:53:23 +00002652 /* Copy everything else as-is */
2653 else
2654 *p++ = (char) ch;
2655 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002656
2657 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002658 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2659 Py_DECREF(repr);
2660 return NULL;
2661 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002662 return repr;
2663}
2664
Guido van Rossumd57fd912000-03-10 22:53:23 +00002665PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2666{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002667 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002668 if (!PyUnicode_Check(unicode)) {
2669 PyErr_BadArgument();
2670 return NULL;
2671 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002672 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2673 PyUnicode_GET_SIZE(unicode));
2674
2675 if (!s)
2676 return NULL;
2677 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2678 PyBytes_GET_SIZE(s));
2679 Py_DECREF(s);
2680 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002681}
2682
2683/* --- Raw Unicode Escape Codec ------------------------------------------- */
2684
2685PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002686 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002687 const char *errors)
2688{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002689 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002690 Py_ssize_t startinpos;
2691 Py_ssize_t endinpos;
2692 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002693 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002694 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002695 const char *end;
2696 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002697 PyObject *errorHandler = NULL;
2698 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002699
Guido van Rossumd57fd912000-03-10 22:53:23 +00002700 /* Escaped strings will always be longer than the resulting
2701 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002702 length after conversion to the true value. (But decoding error
2703 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002704 v = _PyUnicode_New(size);
2705 if (v == NULL)
2706 goto onError;
2707 if (size == 0)
2708 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002709 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002710 end = s + size;
2711 while (s < end) {
2712 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002713 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002714 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002715 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002716
2717 /* Non-escape characters are interpreted as Unicode ordinals */
2718 if (*s != '\\') {
2719 *p++ = (unsigned char)*s++;
2720 continue;
2721 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002722 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002723
2724 /* \u-escapes are only interpreted iff the number of leading
2725 backslashes if odd */
2726 bs = s;
2727 for (;s < end;) {
2728 if (*s != '\\')
2729 break;
2730 *p++ = (unsigned char)*s++;
2731 }
2732 if (((s - bs) & 1) == 0 ||
2733 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002734 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735 continue;
2736 }
2737 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002738 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002739 s++;
2740
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002741 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002742 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002743 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002744 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002745 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002746 endinpos = s-starts;
2747 if (unicode_decode_call_errorhandler(
2748 errors, &errorHandler,
2749 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002750 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002751 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002752 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002753 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002754 }
2755 x = (x<<4) & ~0xF;
2756 if (c >= '0' && c <= '9')
2757 x += c - '0';
2758 else if (c >= 'a' && c <= 'f')
2759 x += 10 + c - 'a';
2760 else
2761 x += 10 + c - 'A';
2762 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002763#ifndef Py_UNICODE_WIDE
2764 if (x > 0x10000) {
2765 if (unicode_decode_call_errorhandler(
2766 errors, &errorHandler,
2767 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002768 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002769 (PyObject **)&v, &outpos, &p))
2770 goto onError;
2771 }
2772#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002773 *p++ = x;
2774 nextByte:
2775 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002776 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002777 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002778 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002779 Py_XDECREF(errorHandler);
2780 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002782
Guido van Rossumd57fd912000-03-10 22:53:23 +00002783 onError:
2784 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002785 Py_XDECREF(errorHandler);
2786 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002787 return NULL;
2788}
2789
2790PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002791 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792{
2793 PyObject *repr;
2794 char *p;
2795 char *q;
2796
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002797#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00002798 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002799#else
Walter Dörwald711005d2007-05-12 12:03:26 +00002800 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002801#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802 if (repr == NULL)
2803 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002804 if (size == 0)
2805 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806
Walter Dörwald711005d2007-05-12 12:03:26 +00002807 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808 while (size-- > 0) {
2809 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002810#ifdef Py_UNICODE_WIDE
2811 /* Map 32-bit characters to '\Uxxxxxxxx' */
2812 if (ch >= 0x10000) {
2813 *p++ = '\\';
2814 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002815 *p++ = hexdigits[(ch >> 28) & 0xf];
2816 *p++ = hexdigits[(ch >> 24) & 0xf];
2817 *p++ = hexdigits[(ch >> 20) & 0xf];
2818 *p++ = hexdigits[(ch >> 16) & 0xf];
2819 *p++ = hexdigits[(ch >> 12) & 0xf];
2820 *p++ = hexdigits[(ch >> 8) & 0xf];
2821 *p++ = hexdigits[(ch >> 4) & 0xf];
2822 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002823 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002824 else
2825#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826 /* Map 16-bit characters to '\uxxxx' */
2827 if (ch >= 256) {
2828 *p++ = '\\';
2829 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002830 *p++ = hexdigits[(ch >> 12) & 0xf];
2831 *p++ = hexdigits[(ch >> 8) & 0xf];
2832 *p++ = hexdigits[(ch >> 4) & 0xf];
2833 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834 }
2835 /* Copy everything else as-is */
2836 else
2837 *p++ = (char) ch;
2838 }
2839 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00002840 if (PyBytes_Resize(repr, p - q)) {
2841 Py_DECREF(repr);
2842 return NULL;
2843 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002844 return repr;
2845}
2846
2847PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2848{
Walter Dörwald711005d2007-05-12 12:03:26 +00002849 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002850 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00002851 PyErr_BadArgument();
2852 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002853 }
Walter Dörwald711005d2007-05-12 12:03:26 +00002854 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2855 PyUnicode_GET_SIZE(unicode));
2856
2857 if (!s)
2858 return NULL;
2859 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2860 PyBytes_GET_SIZE(s));
2861 Py_DECREF(s);
2862 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002863}
2864
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002865/* --- Unicode Internal Codec ------------------------------------------- */
2866
2867PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002868 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002869 const char *errors)
2870{
2871 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002872 Py_ssize_t startinpos;
2873 Py_ssize_t endinpos;
2874 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002875 PyUnicodeObject *v;
2876 Py_UNICODE *p;
2877 const char *end;
2878 const char *reason;
2879 PyObject *errorHandler = NULL;
2880 PyObject *exc = NULL;
2881
Neal Norwitzd43069c2006-01-08 01:12:10 +00002882#ifdef Py_UNICODE_WIDE
2883 Py_UNICODE unimax = PyUnicode_GetMax();
2884#endif
2885
Thomas Wouters89f507f2006-12-13 04:49:30 +00002886 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002887 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2888 if (v == NULL)
2889 goto onError;
2890 if (PyUnicode_GetSize((PyObject *)v) == 0)
2891 return (PyObject *)v;
2892 p = PyUnicode_AS_UNICODE(v);
2893 end = s + size;
2894
2895 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002896 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002897 /* We have to sanity check the raw data, otherwise doom looms for
2898 some malformed UCS-4 data. */
2899 if (
2900 #ifdef Py_UNICODE_WIDE
2901 *p > unimax || *p < 0 ||
2902 #endif
2903 end-s < Py_UNICODE_SIZE
2904 )
2905 {
2906 startinpos = s - starts;
2907 if (end-s < Py_UNICODE_SIZE) {
2908 endinpos = end-starts;
2909 reason = "truncated input";
2910 }
2911 else {
2912 endinpos = s - starts + Py_UNICODE_SIZE;
2913 reason = "illegal code point (> 0x10FFFF)";
2914 }
2915 outpos = p - PyUnicode_AS_UNICODE(v);
2916 if (unicode_decode_call_errorhandler(
2917 errors, &errorHandler,
2918 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002919 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002920 (PyObject **)&v, &outpos, &p)) {
2921 goto onError;
2922 }
2923 }
2924 else {
2925 p++;
2926 s += Py_UNICODE_SIZE;
2927 }
2928 }
2929
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002930 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002931 goto onError;
2932 Py_XDECREF(errorHandler);
2933 Py_XDECREF(exc);
2934 return (PyObject *)v;
2935
2936 onError:
2937 Py_XDECREF(v);
2938 Py_XDECREF(errorHandler);
2939 Py_XDECREF(exc);
2940 return NULL;
2941}
2942
Guido van Rossumd57fd912000-03-10 22:53:23 +00002943/* --- Latin-1 Codec ------------------------------------------------------ */
2944
2945PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002946 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002947 const char *errors)
2948{
2949 PyUnicodeObject *v;
2950 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002951
Guido van Rossumd57fd912000-03-10 22:53:23 +00002952 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002953 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002954 Py_UNICODE r = *(unsigned char*)s;
2955 return PyUnicode_FromUnicode(&r, 1);
2956 }
2957
Guido van Rossumd57fd912000-03-10 22:53:23 +00002958 v = _PyUnicode_New(size);
2959 if (v == NULL)
2960 goto onError;
2961 if (size == 0)
2962 return (PyObject *)v;
2963 p = PyUnicode_AS_UNICODE(v);
2964 while (size-- > 0)
2965 *p++ = (unsigned char)*s++;
2966 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002967
Guido van Rossumd57fd912000-03-10 22:53:23 +00002968 onError:
2969 Py_XDECREF(v);
2970 return NULL;
2971}
2972
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002973/* create or adjust a UnicodeEncodeError */
2974static void make_encode_exception(PyObject **exceptionObject,
2975 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002976 const Py_UNICODE *unicode, Py_ssize_t size,
2977 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002978 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002980 if (*exceptionObject == NULL) {
2981 *exceptionObject = PyUnicodeEncodeError_Create(
2982 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002983 }
2984 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002985 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2986 goto onError;
2987 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2988 goto onError;
2989 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2990 goto onError;
2991 return;
2992 onError:
2993 Py_DECREF(*exceptionObject);
2994 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002995 }
2996}
2997
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002998/* raises a UnicodeEncodeError */
2999static void raise_encode_exception(PyObject **exceptionObject,
3000 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003001 const Py_UNICODE *unicode, Py_ssize_t size,
3002 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003003 const char *reason)
3004{
3005 make_encode_exception(exceptionObject,
3006 encoding, unicode, size, startpos, endpos, reason);
3007 if (*exceptionObject != NULL)
3008 PyCodec_StrictErrors(*exceptionObject);
3009}
3010
3011/* error handling callback helper:
3012 build arguments, call the callback and check the arguments,
3013 put the result into newpos and return the replacement string, which
3014 has to be freed by the caller */
3015static PyObject *unicode_encode_call_errorhandler(const char *errors,
3016 PyObject **errorHandler,
3017 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003018 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3019 Py_ssize_t startpos, Py_ssize_t endpos,
3020 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003021{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003022 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003023
3024 PyObject *restuple;
3025 PyObject *resunicode;
3026
3027 if (*errorHandler == NULL) {
3028 *errorHandler = PyCodec_LookupError(errors);
3029 if (*errorHandler == NULL)
3030 return NULL;
3031 }
3032
3033 make_encode_exception(exceptionObject,
3034 encoding, unicode, size, startpos, endpos, reason);
3035 if (*exceptionObject == NULL)
3036 return NULL;
3037
3038 restuple = PyObject_CallFunctionObjArgs(
3039 *errorHandler, *exceptionObject, NULL);
3040 if (restuple == NULL)
3041 return NULL;
3042 if (!PyTuple_Check(restuple)) {
3043 PyErr_Format(PyExc_TypeError, &argparse[4]);
3044 Py_DECREF(restuple);
3045 return NULL;
3046 }
3047 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3048 &resunicode, newpos)) {
3049 Py_DECREF(restuple);
3050 return NULL;
3051 }
3052 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003053 *newpos = size+*newpos;
3054 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003055 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003056 Py_DECREF(restuple);
3057 return NULL;
3058 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003059 Py_INCREF(resunicode);
3060 Py_DECREF(restuple);
3061 return resunicode;
3062}
3063
3064static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003065 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003066 const char *errors,
3067 int limit)
3068{
3069 /* output object */
3070 PyObject *res;
3071 /* pointers to the beginning and end+1 of input */
3072 const Py_UNICODE *startp = p;
3073 const Py_UNICODE *endp = p + size;
3074 /* pointer to the beginning of the unencodable characters */
3075 /* const Py_UNICODE *badp = NULL; */
3076 /* pointer into the output */
3077 char *str;
3078 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003079 Py_ssize_t respos = 0;
3080 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003081 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3082 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003083 PyObject *errorHandler = NULL;
3084 PyObject *exc = NULL;
3085 /* the following variable is used for caching string comparisons
3086 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3087 int known_errorHandler = -1;
3088
3089 /* allocate enough for a simple encoding without
3090 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003091 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003092 if (res == NULL)
3093 goto onError;
3094 if (size == 0)
3095 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003096 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003097 ressize = size;
3098
3099 while (p<endp) {
3100 Py_UNICODE c = *p;
3101
3102 /* can we encode this? */
3103 if (c<limit) {
3104 /* no overflow check, because we know that the space is enough */
3105 *str++ = (char)c;
3106 ++p;
3107 }
3108 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003109 Py_ssize_t unicodepos = p-startp;
3110 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003111 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003112 Py_ssize_t repsize;
3113 Py_ssize_t newpos;
3114 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003115 Py_UNICODE *uni2;
3116 /* startpos for collecting unencodable chars */
3117 const Py_UNICODE *collstart = p;
3118 const Py_UNICODE *collend = p;
3119 /* find all unecodable characters */
3120 while ((collend < endp) && ((*collend)>=limit))
3121 ++collend;
3122 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3123 if (known_errorHandler==-1) {
3124 if ((errors==NULL) || (!strcmp(errors, "strict")))
3125 known_errorHandler = 1;
3126 else if (!strcmp(errors, "replace"))
3127 known_errorHandler = 2;
3128 else if (!strcmp(errors, "ignore"))
3129 known_errorHandler = 3;
3130 else if (!strcmp(errors, "xmlcharrefreplace"))
3131 known_errorHandler = 4;
3132 else
3133 known_errorHandler = 0;
3134 }
3135 switch (known_errorHandler) {
3136 case 1: /* strict */
3137 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3138 goto onError;
3139 case 2: /* replace */
3140 while (collstart++<collend)
3141 *str++ = '?'; /* fall through */
3142 case 3: /* ignore */
3143 p = collend;
3144 break;
3145 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003146 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003147 /* determine replacement size (temporarily (mis)uses p) */
3148 for (p = collstart, repsize = 0; p < collend; ++p) {
3149 if (*p<10)
3150 repsize += 2+1+1;
3151 else if (*p<100)
3152 repsize += 2+2+1;
3153 else if (*p<1000)
3154 repsize += 2+3+1;
3155 else if (*p<10000)
3156 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003157#ifndef Py_UNICODE_WIDE
3158 else
3159 repsize += 2+5+1;
3160#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003161 else if (*p<100000)
3162 repsize += 2+5+1;
3163 else if (*p<1000000)
3164 repsize += 2+6+1;
3165 else
3166 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003167#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003168 }
3169 requiredsize = respos+repsize+(endp-collend);
3170 if (requiredsize > ressize) {
3171 if (requiredsize<2*ressize)
3172 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003173 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003174 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003175 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003176 ressize = requiredsize;
3177 }
3178 /* generate replacement (temporarily (mis)uses p) */
3179 for (p = collstart; p < collend; ++p) {
3180 str += sprintf(str, "&#%d;", (int)*p);
3181 }
3182 p = collend;
3183 break;
3184 default:
3185 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3186 encoding, reason, startp, size, &exc,
3187 collstart-startp, collend-startp, &newpos);
3188 if (repunicode == NULL)
3189 goto onError;
3190 /* need more space? (at least enough for what we
3191 have+the replacement+the rest of the string, so
3192 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003193 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003194 repsize = PyUnicode_GET_SIZE(repunicode);
3195 requiredsize = respos+repsize+(endp-collend);
3196 if (requiredsize > ressize) {
3197 if (requiredsize<2*ressize)
3198 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003199 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003200 Py_DECREF(repunicode);
3201 goto onError;
3202 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003203 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003204 ressize = requiredsize;
3205 }
3206 /* check if there is anything unencodable in the replacement
3207 and copy it to the output */
3208 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3209 c = *uni2;
3210 if (c >= limit) {
3211 raise_encode_exception(&exc, encoding, startp, size,
3212 unicodepos, unicodepos+1, reason);
3213 Py_DECREF(repunicode);
3214 goto onError;
3215 }
3216 *str = (char)c;
3217 }
3218 p = startp + newpos;
3219 Py_DECREF(repunicode);
3220 }
3221 }
3222 }
3223 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003224 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003225 if (respos<ressize)
3226 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003227 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003228 Py_XDECREF(errorHandler);
3229 Py_XDECREF(exc);
3230 return res;
3231
3232 onError:
3233 Py_XDECREF(res);
3234 Py_XDECREF(errorHandler);
3235 Py_XDECREF(exc);
3236 return NULL;
3237}
3238
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003240 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003241 const char *errors)
3242{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003243 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003244}
3245
3246PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3247{
3248 if (!PyUnicode_Check(unicode)) {
3249 PyErr_BadArgument();
3250 return NULL;
3251 }
3252 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3253 PyUnicode_GET_SIZE(unicode),
3254 NULL);
3255}
3256
3257/* --- 7-bit ASCII Codec -------------------------------------------------- */
3258
Guido van Rossumd57fd912000-03-10 22:53:23 +00003259PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003260 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261 const char *errors)
3262{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003263 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003264 PyUnicodeObject *v;
3265 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003266 Py_ssize_t startinpos;
3267 Py_ssize_t endinpos;
3268 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003269 const char *e;
3270 PyObject *errorHandler = NULL;
3271 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003272
Guido van Rossumd57fd912000-03-10 22:53:23 +00003273 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003274 if (size == 1 && *(unsigned char*)s < 128) {
3275 Py_UNICODE r = *(unsigned char*)s;
3276 return PyUnicode_FromUnicode(&r, 1);
3277 }
Tim Petersced69f82003-09-16 20:30:58 +00003278
Guido van Rossumd57fd912000-03-10 22:53:23 +00003279 v = _PyUnicode_New(size);
3280 if (v == NULL)
3281 goto onError;
3282 if (size == 0)
3283 return (PyObject *)v;
3284 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003285 e = s + size;
3286 while (s < e) {
3287 register unsigned char c = (unsigned char)*s;
3288 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003289 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003290 ++s;
3291 }
3292 else {
3293 startinpos = s-starts;
3294 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003295 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003296 if (unicode_decode_call_errorhandler(
3297 errors, &errorHandler,
3298 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003299 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003300 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003301 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003302 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003304 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003305 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003306 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003307 Py_XDECREF(errorHandler);
3308 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003310
Guido van Rossumd57fd912000-03-10 22:53:23 +00003311 onError:
3312 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003313 Py_XDECREF(errorHandler);
3314 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003315 return NULL;
3316}
3317
Guido van Rossumd57fd912000-03-10 22:53:23 +00003318PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003319 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003320 const char *errors)
3321{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003322 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003323}
3324
3325PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3326{
3327 if (!PyUnicode_Check(unicode)) {
3328 PyErr_BadArgument();
3329 return NULL;
3330 }
3331 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3332 PyUnicode_GET_SIZE(unicode),
3333 NULL);
3334}
3335
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003336#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003337
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003338/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003339
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003340#if SIZEOF_INT < SIZEOF_SSIZE_T
3341#define NEED_RETRY
3342#endif
3343
3344/* XXX This code is limited to "true" double-byte encodings, as
3345 a) it assumes an incomplete character consists of a single byte, and
3346 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3347 encodings, see IsDBCSLeadByteEx documentation. */
3348
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte in its
   context, 0 otherwise.  A byte that IsDBCSLeadByte() accepts only counts
   when the preceding character (found with CharPrev) does not absorb it. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3359
3360/*
3361 * Decode MBCS string into unicode object. If 'final' is set, converts
3362 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3363 */
3364static int decode_mbcs(PyUnicodeObject **v,
3365 const char *s, /* MBCS string */
3366 int size, /* sizeof MBCS string */
3367 int final)
3368{
3369 Py_UNICODE *p;
3370 Py_ssize_t n = 0;
3371 int usize = 0;
3372
3373 assert(size >= 0);
3374
3375 /* Skip trailing lead-byte unless 'final' is set */
3376 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3377 --size;
3378
3379 /* First get the size of the result */
3380 if (size > 0) {
3381 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3382 if (usize == 0) {
3383 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3384 return -1;
3385 }
3386 }
3387
3388 if (*v == NULL) {
3389 /* Create unicode object */
3390 *v = _PyUnicode_New(usize);
3391 if (*v == NULL)
3392 return -1;
3393 }
3394 else {
3395 /* Extend unicode object */
3396 n = PyUnicode_GET_SIZE(*v);
3397 if (_PyUnicode_Resize(v, n + usize) < 0)
3398 return -1;
3399 }
3400
3401 /* Do the conversion */
3402 if (size > 0) {
3403 p = PyUnicode_AS_UNICODE(*v) + n;
3404 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3405 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3406 return -1;
3407 }
3408 }
3409
3410 return size;
3411}
3412
3413PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3414 Py_ssize_t size,
3415 const char *errors,
3416 Py_ssize_t *consumed)
3417{
3418 PyUnicodeObject *v = NULL;
3419 int done;
3420
3421 if (consumed)
3422 *consumed = 0;
3423
3424#ifdef NEED_RETRY
3425 retry:
3426 if (size > INT_MAX)
3427 done = decode_mbcs(&v, s, INT_MAX, 0);
3428 else
3429#endif
3430 done = decode_mbcs(&v, s, (int)size, !consumed);
3431
3432 if (done < 0) {
3433 Py_XDECREF(v);
3434 return NULL;
3435 }
3436
3437 if (consumed)
3438 *consumed += done;
3439
3440#ifdef NEED_RETRY
3441 if (size > INT_MAX) {
3442 s += done;
3443 size -= done;
3444 goto retry;
3445 }
3446#endif
3447
3448 return (PyObject *)v;
3449}
3450
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003451PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003452 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003453 const char *errors)
3454{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003455 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3456}
3457
3458/*
3459 * Convert unicode into string object (MBCS).
3460 * Returns 0 if succeed, -1 otherwise.
3461 */
3462static int encode_mbcs(PyObject **repr,
3463 const Py_UNICODE *p, /* unicode */
3464 int size) /* size of unicode */
3465{
3466 int mbcssize = 0;
3467 Py_ssize_t n = 0;
3468
3469 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003470
3471 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003472 if (size > 0) {
3473 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3474 if (mbcssize == 0) {
3475 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3476 return -1;
3477 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003478 }
3479
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003480 if (*repr == NULL) {
3481 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003482 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003483 if (*repr == NULL)
3484 return -1;
3485 }
3486 else {
3487 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003488 n = PyBytes_Size(*repr);
3489 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003490 return -1;
3491 }
3492
3493 /* Do the conversion */
3494 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003495 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003496 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3497 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3498 return -1;
3499 }
3500 }
3501
3502 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003503}
3504
3505PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003506 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003507 const char *errors)
3508{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003509 PyObject *repr = NULL;
3510 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003511
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003512#ifdef NEED_RETRY
3513 retry:
3514 if (size > INT_MAX)
3515 ret = encode_mbcs(&repr, p, INT_MAX);
3516 else
3517#endif
3518 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003519
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003520 if (ret < 0) {
3521 Py_XDECREF(repr);
3522 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003523 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003524
3525#ifdef NEED_RETRY
3526 if (size > INT_MAX) {
3527 p += INT_MAX;
3528 size -= INT_MAX;
3529 goto retry;
3530 }
3531#endif
3532
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003533 return repr;
3534}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003535
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003536PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3537{
3538 if (!PyUnicode_Check(unicode)) {
3539 PyErr_BadArgument();
3540 return NULL;
3541 }
3542 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3543 PyUnicode_GET_SIZE(unicode),
3544 NULL);
3545}
3546
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003547#undef NEED_RETRY
3548
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003549#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003550
Guido van Rossumd57fd912000-03-10 22:53:23 +00003551/* --- Character Mapping Codec -------------------------------------------- */
3552
Guido van Rossumd57fd912000-03-10 22:53:23 +00003553PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003554 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003555 PyObject *mapping,
3556 const char *errors)
3557{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003558 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003559 Py_ssize_t startinpos;
3560 Py_ssize_t endinpos;
3561 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003562 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563 PyUnicodeObject *v;
3564 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003565 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003566 PyObject *errorHandler = NULL;
3567 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003568 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003569 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003570
Guido van Rossumd57fd912000-03-10 22:53:23 +00003571 /* Default to Latin-1 */
3572 if (mapping == NULL)
3573 return PyUnicode_DecodeLatin1(s, size, errors);
3574
3575 v = _PyUnicode_New(size);
3576 if (v == NULL)
3577 goto onError;
3578 if (size == 0)
3579 return (PyObject *)v;
3580 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003581 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003582 if (PyUnicode_CheckExact(mapping)) {
3583 mapstring = PyUnicode_AS_UNICODE(mapping);
3584 maplen = PyUnicode_GET_SIZE(mapping);
3585 while (s < e) {
3586 unsigned char ch = *s;
3587 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003588
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003589 if (ch < maplen)
3590 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003591
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003592 if (x == 0xfffe) {
3593 /* undefined mapping */
3594 outpos = p-PyUnicode_AS_UNICODE(v);
3595 startinpos = s-starts;
3596 endinpos = startinpos+1;
3597 if (unicode_decode_call_errorhandler(
3598 errors, &errorHandler,
3599 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003600 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003601 (PyObject **)&v, &outpos, &p)) {
3602 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003603 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003604 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003605 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003606 *p++ = x;
3607 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003608 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003609 }
3610 else {
3611 while (s < e) {
3612 unsigned char ch = *s;
3613 PyObject *w, *x;
3614
3615 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3616 w = PyInt_FromLong((long)ch);
3617 if (w == NULL)
3618 goto onError;
3619 x = PyObject_GetItem(mapping, w);
3620 Py_DECREF(w);
3621 if (x == NULL) {
3622 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3623 /* No mapping found means: mapping is undefined. */
3624 PyErr_Clear();
3625 x = Py_None;
3626 Py_INCREF(x);
3627 } else
3628 goto onError;
3629 }
3630
3631 /* Apply mapping */
3632 if (PyInt_Check(x)) {
3633 long value = PyInt_AS_LONG(x);
3634 if (value < 0 || value > 65535) {
3635 PyErr_SetString(PyExc_TypeError,
3636 "character mapping must be in range(65536)");
3637 Py_DECREF(x);
3638 goto onError;
3639 }
3640 *p++ = (Py_UNICODE)value;
3641 }
3642 else if (x == Py_None) {
3643 /* undefined mapping */
3644 outpos = p-PyUnicode_AS_UNICODE(v);
3645 startinpos = s-starts;
3646 endinpos = startinpos+1;
3647 if (unicode_decode_call_errorhandler(
3648 errors, &errorHandler,
3649 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003650 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003651 (PyObject **)&v, &outpos, &p)) {
3652 Py_DECREF(x);
3653 goto onError;
3654 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003655 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003656 continue;
3657 }
3658 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003659 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003660
3661 if (targetsize == 1)
3662 /* 1-1 mapping */
3663 *p++ = *PyUnicode_AS_UNICODE(x);
3664
3665 else if (targetsize > 1) {
3666 /* 1-n mapping */
3667 if (targetsize > extrachars) {
3668 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003669 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3670 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003671 (targetsize << 2);
3672 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003673 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003674 if (_PyUnicode_Resize(&v,
3675 PyUnicode_GET_SIZE(v) + needed) < 0) {
3676 Py_DECREF(x);
3677 goto onError;
3678 }
3679 p = PyUnicode_AS_UNICODE(v) + oldpos;
3680 }
3681 Py_UNICODE_COPY(p,
3682 PyUnicode_AS_UNICODE(x),
3683 targetsize);
3684 p += targetsize;
3685 extrachars -= targetsize;
3686 }
3687 /* 1-0 mapping: skip the character */
3688 }
3689 else {
3690 /* wrong return value */
3691 PyErr_SetString(PyExc_TypeError,
3692 "character mapping must return integer, None or unicode");
3693 Py_DECREF(x);
3694 goto onError;
3695 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003696 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003697 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003698 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003699 }
3700 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003701 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003702 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003703 Py_XDECREF(errorHandler);
3704 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003705 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003706
Guido van Rossumd57fd912000-03-10 22:53:23 +00003707 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003708 Py_XDECREF(errorHandler);
3709 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003710 Py_XDECREF(v);
3711 return NULL;
3712}
3713
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003714/* Charmap encoding: the lookup table */
3715
3716struct encoding_map{
3717 PyObject_HEAD
3718 unsigned char level1[32];
3719 int count2, count3;
3720 unsigned char level23[1];
3721};
3722
3723static PyObject*
3724encoding_map_size(PyObject *obj, PyObject* args)
3725{
3726 struct encoding_map *map = (struct encoding_map*)obj;
3727 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3728 128*map->count3);
3729}
3730
3731static PyMethodDef encoding_map_methods[] = {
3732 {"size", encoding_map_size, METH_NOARGS,
3733 PyDoc_STR("Return the size (in bytes) of this object") },
3734 { 0 }
3735};
3736
3737static void
3738encoding_map_dealloc(PyObject* o)
3739{
3740 PyObject_FREE(o);
3741}
3742
3743static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003744 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003745 "EncodingMap", /*tp_name*/
3746 sizeof(struct encoding_map), /*tp_basicsize*/
3747 0, /*tp_itemsize*/
3748 /* methods */
3749 encoding_map_dealloc, /*tp_dealloc*/
3750 0, /*tp_print*/
3751 0, /*tp_getattr*/
3752 0, /*tp_setattr*/
3753 0, /*tp_compare*/
3754 0, /*tp_repr*/
3755 0, /*tp_as_number*/
3756 0, /*tp_as_sequence*/
3757 0, /*tp_as_mapping*/
3758 0, /*tp_hash*/
3759 0, /*tp_call*/
3760 0, /*tp_str*/
3761 0, /*tp_getattro*/
3762 0, /*tp_setattro*/
3763 0, /*tp_as_buffer*/
3764 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3765 0, /*tp_doc*/
3766 0, /*tp_traverse*/
3767 0, /*tp_clear*/
3768 0, /*tp_richcompare*/
3769 0, /*tp_weaklistoffset*/
3770 0, /*tp_iter*/
3771 0, /*tp_iternext*/
3772 encoding_map_methods, /*tp_methods*/
3773 0, /*tp_members*/
3774 0, /*tp_getset*/
3775 0, /*tp_base*/
3776 0, /*tp_dict*/
3777 0, /*tp_descr_get*/
3778 0, /*tp_descr_set*/
3779 0, /*tp_dictoffset*/
3780 0, /*tp_init*/
3781 0, /*tp_alloc*/
3782 0, /*tp_new*/
3783 0, /*tp_free*/
3784 0, /*tp_is_gc*/
3785};
3786
3787PyObject*
3788PyUnicode_BuildEncodingMap(PyObject* string)
3789{
3790 Py_UNICODE *decode;
3791 PyObject *result;
3792 struct encoding_map *mresult;
3793 int i;
3794 int need_dict = 0;
3795 unsigned char level1[32];
3796 unsigned char level2[512];
3797 unsigned char *mlevel1, *mlevel2, *mlevel3;
3798 int count2 = 0, count3 = 0;
3799
3800 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3801 PyErr_BadArgument();
3802 return NULL;
3803 }
3804 decode = PyUnicode_AS_UNICODE(string);
3805 memset(level1, 0xFF, sizeof level1);
3806 memset(level2, 0xFF, sizeof level2);
3807
3808 /* If there isn't a one-to-one mapping of NULL to \0,
3809 or if there are non-BMP characters, we need to use
3810 a mapping dictionary. */
3811 if (decode[0] != 0)
3812 need_dict = 1;
3813 for (i = 1; i < 256; i++) {
3814 int l1, l2;
3815 if (decode[i] == 0
3816 #ifdef Py_UNICODE_WIDE
3817 || decode[i] > 0xFFFF
3818 #endif
3819 ) {
3820 need_dict = 1;
3821 break;
3822 }
3823 if (decode[i] == 0xFFFE)
3824 /* unmapped character */
3825 continue;
3826 l1 = decode[i] >> 11;
3827 l2 = decode[i] >> 7;
3828 if (level1[l1] == 0xFF)
3829 level1[l1] = count2++;
3830 if (level2[l2] == 0xFF)
3831 level2[l2] = count3++;
3832 }
3833
3834 if (count2 >= 0xFF || count3 >= 0xFF)
3835 need_dict = 1;
3836
3837 if (need_dict) {
3838 PyObject *result = PyDict_New();
3839 PyObject *key, *value;
3840 if (!result)
3841 return NULL;
3842 for (i = 0; i < 256; i++) {
3843 key = value = NULL;
3844 key = PyInt_FromLong(decode[i]);
3845 value = PyInt_FromLong(i);
3846 if (!key || !value)
3847 goto failed1;
3848 if (PyDict_SetItem(result, key, value) == -1)
3849 goto failed1;
3850 Py_DECREF(key);
3851 Py_DECREF(value);
3852 }
3853 return result;
3854 failed1:
3855 Py_XDECREF(key);
3856 Py_XDECREF(value);
3857 Py_DECREF(result);
3858 return NULL;
3859 }
3860
3861 /* Create a three-level trie */
3862 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3863 16*count2 + 128*count3 - 1);
3864 if (!result)
3865 return PyErr_NoMemory();
3866 PyObject_Init(result, &EncodingMapType);
3867 mresult = (struct encoding_map*)result;
3868 mresult->count2 = count2;
3869 mresult->count3 = count3;
3870 mlevel1 = mresult->level1;
3871 mlevel2 = mresult->level23;
3872 mlevel3 = mresult->level23 + 16*count2;
3873 memcpy(mlevel1, level1, 32);
3874 memset(mlevel2, 0xFF, 16*count2);
3875 memset(mlevel3, 0, 128*count3);
3876 count3 = 0;
3877 for (i = 1; i < 256; i++) {
3878 int o1, o2, o3, i2, i3;
3879 if (decode[i] == 0xFFFE)
3880 /* unmapped character */
3881 continue;
3882 o1 = decode[i]>>11;
3883 o2 = (decode[i]>>7) & 0xF;
3884 i2 = 16*mlevel1[o1] + o2;
3885 if (mlevel2[i2] == 0xFF)
3886 mlevel2[i2] = count3++;
3887 o3 = decode[i] & 0x7F;
3888 i3 = 128*mlevel2[i2] + o3;
3889 mlevel3[i3] = i;
3890 }
3891 return result;
3892}
3893
3894static int
3895encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3896{
3897 struct encoding_map *map = (struct encoding_map*)mapping;
3898 int l1 = c>>11;
3899 int l2 = (c>>7) & 0xF;
3900 int l3 = c & 0x7F;
3901 int i;
3902
3903#ifdef Py_UNICODE_WIDE
3904 if (c > 0xFFFF) {
3905 return -1;
3906 }
3907#endif
3908 if (c == 0)
3909 return 0;
3910 /* level 1*/
3911 i = map->level1[l1];
3912 if (i == 0xFF) {
3913 return -1;
3914 }
3915 /* level 2*/
3916 i = map->level23[16*i+l2];
3917 if (i == 0xFF) {
3918 return -1;
3919 }
3920 /* level 3 */
3921 i = map->level23[16*map->count2 + 128*i + l3];
3922 if (i == 0) {
3923 return -1;
3924 }
3925 return i;
3926}
3927
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003928/* Lookup the character ch in the mapping. If the character
3929 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003930 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003931static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003932{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003933 PyObject *w = PyInt_FromLong((long)c);
3934 PyObject *x;
3935
3936 if (w == NULL)
3937 return NULL;
3938 x = PyObject_GetItem(mapping, w);
3939 Py_DECREF(w);
3940 if (x == NULL) {
3941 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3942 /* No mapping found means: mapping is undefined. */
3943 PyErr_Clear();
3944 x = Py_None;
3945 Py_INCREF(x);
3946 return x;
3947 } else
3948 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003949 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003950 else if (x == Py_None)
3951 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003952 else if (PyInt_Check(x)) {
3953 long value = PyInt_AS_LONG(x);
3954 if (value < 0 || value > 255) {
3955 PyErr_SetString(PyExc_TypeError,
3956 "character mapping must be in range(256)");
3957 Py_DECREF(x);
3958 return NULL;
3959 }
3960 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003961 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003962 else if (PyString_Check(x))
3963 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003964 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003965 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00003966 PyErr_Format(PyExc_TypeError,
3967 "character mapping must return integer, None or str8, not %.400s",
3968 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003969 Py_DECREF(x);
3970 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003971 }
3972}
3973
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003974static int
Walter Dörwald827b0552007-05-12 13:23:53 +00003975charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003976{
Walter Dörwald827b0552007-05-12 13:23:53 +00003977 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003978 /* exponentially overallocate to minimize reallocations */
3979 if (requiredsize < 2*outsize)
3980 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00003981 if (PyBytes_Resize(outobj, requiredsize)) {
3982 Py_DECREF(outobj);
3983 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003984 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003985 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003986}
3987
/* Outcome of charmapencode_output() for a single character. */
typedef enum charmapencode_result {
    enc_SUCCESS = 0,    /* the character was written to the output */
    enc_FAILED = 1,     /* the mapping has no entry for the character */
    enc_EXCEPTION = 2   /* lookup or output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003991/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00003992 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003993 space is available. Return a new reference to the object that
3994 was put in the output buffer, or Py_None, if the mapping was undefined
3995 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003996 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003997static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003998charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00003999 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004000{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004001 PyObject *rep;
4002 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00004003 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004004
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004005 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004006 int res = encoding_map_lookup(c, mapping);
4007 Py_ssize_t requiredsize = *outpos+1;
4008 if (res == -1)
4009 return enc_FAILED;
4010 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004011 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004012 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00004013 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004014 outstart[(*outpos)++] = (char)res;
4015 return enc_SUCCESS;
4016 }
4017
4018 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004019 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004020 return enc_EXCEPTION;
4021 else if (rep==Py_None) {
4022 Py_DECREF(rep);
4023 return enc_FAILED;
4024 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004025 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004026 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004027 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004028 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004029 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004030 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004031 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004032 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004033 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4034 }
4035 else {
4036 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004037 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4038 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004039 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004040 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004041 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004042 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004043 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004044 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004045 memcpy(outstart + *outpos, repchars, repsize);
4046 *outpos += repsize;
4047 }
4048 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004049 Py_DECREF(rep);
4050 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004051}
4052
4053/* handle an error in PyUnicode_EncodeCharmap
4054 Return 0 on success, -1 on error */
4055static
4056int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004057 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004058 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004059 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004060 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004061{
4062 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004063 Py_ssize_t repsize;
4064 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004065 Py_UNICODE *uni2;
4066 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004067 Py_ssize_t collstartpos = *inpos;
4068 Py_ssize_t collendpos = *inpos+1;
4069 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004070 char *encoding = "charmap";
4071 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004072 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004073
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004074 /* find all unencodable characters */
4075 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004076 PyObject *rep;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004077 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004078 int res = encoding_map_lookup(p[collendpos], mapping);
4079 if (res != -1)
4080 break;
4081 ++collendpos;
4082 continue;
4083 }
4084
4085 rep = charmapencode_lookup(p[collendpos], mapping);
4086 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004087 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004088 else if (rep!=Py_None) {
4089 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004090 break;
4091 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004092 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004093 ++collendpos;
4094 }
4095 /* cache callback name lookup
4096 * (if not done yet, i.e. it's the first error) */
4097 if (*known_errorHandler==-1) {
4098 if ((errors==NULL) || (!strcmp(errors, "strict")))
4099 *known_errorHandler = 1;
4100 else if (!strcmp(errors, "replace"))
4101 *known_errorHandler = 2;
4102 else if (!strcmp(errors, "ignore"))
4103 *known_errorHandler = 3;
4104 else if (!strcmp(errors, "xmlcharrefreplace"))
4105 *known_errorHandler = 4;
4106 else
4107 *known_errorHandler = 0;
4108 }
4109 switch (*known_errorHandler) {
4110 case 1: /* strict */
4111 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4112 return -1;
4113 case 2: /* replace */
4114 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4115 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004116 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004117 return -1;
4118 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004119 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004120 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4121 return -1;
4122 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123 }
4124 /* fall through */
4125 case 3: /* ignore */
4126 *inpos = collendpos;
4127 break;
4128 case 4: /* xmlcharrefreplace */
4129 /* generate replacement (temporarily (mis)uses p) */
4130 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4131 char buffer[2+29+1+1];
4132 char *cp;
4133 sprintf(buffer, "&#%d;", (int)p[collpos]);
4134 for (cp = buffer; *cp; ++cp) {
4135 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004136 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004137 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004138 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004139 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4140 return -1;
4141 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004142 }
4143 }
4144 *inpos = collendpos;
4145 break;
4146 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004147 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004148 encoding, reason, p, size, exceptionObject,
4149 collstartpos, collendpos, &newpos);
4150 if (repunicode == NULL)
4151 return -1;
4152 /* generate replacement */
4153 repsize = PyUnicode_GET_SIZE(repunicode);
4154 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4155 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004156 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004157 return -1;
4158 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004159 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004160 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004161 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4162 return -1;
4163 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004164 }
4165 *inpos = newpos;
4166 Py_DECREF(repunicode);
4167 }
4168 return 0;
4169}
4170
Guido van Rossumd57fd912000-03-10 22:53:23 +00004171PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004172 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004173 PyObject *mapping,
4174 const char *errors)
4175{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004176 /* output object */
4177 PyObject *res = NULL;
4178 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004179 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004180 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004181 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004182 PyObject *errorHandler = NULL;
4183 PyObject *exc = NULL;
4184 /* the following variable is used for caching string comparisons
4185 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4186 * 3=ignore, 4=xmlcharrefreplace */
4187 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004188
4189 /* Default to Latin-1 */
4190 if (mapping == NULL)
4191 return PyUnicode_EncodeLatin1(p, size, errors);
4192
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004193 /* allocate enough for a simple encoding without
4194 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004195 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004196 if (res == NULL)
4197 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004198 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004199 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004200
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004201 while (inpos<size) {
4202 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004203 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004204 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004205 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004206 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004207 if (charmap_encoding_error(p, size, &inpos, mapping,
4208 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004209 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004210 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004211 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004212 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004213 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004214 else
4215 /* done with this character => adjust input position */
4216 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004217 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004218
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004219 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004220 if (respos<PyBytes_GET_SIZE(res)) {
4221 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004222 goto onError;
4223 }
4224 Py_XDECREF(exc);
4225 Py_XDECREF(errorHandler);
4226 return res;
4227
4228 onError:
4229 Py_XDECREF(res);
4230 Py_XDECREF(exc);
4231 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004232 return NULL;
4233}
4234
4235PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4236 PyObject *mapping)
4237{
4238 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4239 PyErr_BadArgument();
4240 return NULL;
4241 }
4242 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4243 PyUnicode_GET_SIZE(unicode),
4244 mapping,
4245 NULL);
4246}
4247
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004248/* create or adjust a UnicodeTranslateError */
4249static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004250 const Py_UNICODE *unicode, Py_ssize_t size,
4251 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004252 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004253{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004254 if (*exceptionObject == NULL) {
4255 *exceptionObject = PyUnicodeTranslateError_Create(
4256 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004257 }
4258 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004259 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4260 goto onError;
4261 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4262 goto onError;
4263 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4264 goto onError;
4265 return;
4266 onError:
4267 Py_DECREF(*exceptionObject);
4268 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004269 }
4270}
4271
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004272/* raises a UnicodeTranslateError */
4273static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004274 const Py_UNICODE *unicode, Py_ssize_t size,
4275 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004276 const char *reason)
4277{
4278 make_translate_exception(exceptionObject,
4279 unicode, size, startpos, endpos, reason);
4280 if (*exceptionObject != NULL)
4281 PyCodec_StrictErrors(*exceptionObject);
4282}
4283
4284/* error handling callback helper:
4285 build arguments, call the callback and check the arguments,
4286 put the result into newpos and return the replacement string, which
4287 has to be freed by the caller */
4288static PyObject *unicode_translate_call_errorhandler(const char *errors,
4289 PyObject **errorHandler,
4290 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004291 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4292 Py_ssize_t startpos, Py_ssize_t endpos,
4293 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004294{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004295 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004296
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004297 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004298 PyObject *restuple;
4299 PyObject *resunicode;
4300
4301 if (*errorHandler == NULL) {
4302 *errorHandler = PyCodec_LookupError(errors);
4303 if (*errorHandler == NULL)
4304 return NULL;
4305 }
4306
4307 make_translate_exception(exceptionObject,
4308 unicode, size, startpos, endpos, reason);
4309 if (*exceptionObject == NULL)
4310 return NULL;
4311
4312 restuple = PyObject_CallFunctionObjArgs(
4313 *errorHandler, *exceptionObject, NULL);
4314 if (restuple == NULL)
4315 return NULL;
4316 if (!PyTuple_Check(restuple)) {
4317 PyErr_Format(PyExc_TypeError, &argparse[4]);
4318 Py_DECREF(restuple);
4319 return NULL;
4320 }
4321 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004322 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004323 Py_DECREF(restuple);
4324 return NULL;
4325 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004326 if (i_newpos<0)
4327 *newpos = size+i_newpos;
4328 else
4329 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004330 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004331 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004332 Py_DECREF(restuple);
4333 return NULL;
4334 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004335 Py_INCREF(resunicode);
4336 Py_DECREF(restuple);
4337 return resunicode;
4338}
4339
4340/* Lookup the character ch in the mapping and put the result in result,
4341 which must be decrefed by the caller.
4342 Return 0 on success, -1 on error */
4343static
4344int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4345{
4346 PyObject *w = PyInt_FromLong((long)c);
4347 PyObject *x;
4348
4349 if (w == NULL)
4350 return -1;
4351 x = PyObject_GetItem(mapping, w);
4352 Py_DECREF(w);
4353 if (x == NULL) {
4354 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4355 /* No mapping found means: use 1:1 mapping. */
4356 PyErr_Clear();
4357 *result = NULL;
4358 return 0;
4359 } else
4360 return -1;
4361 }
4362 else if (x == Py_None) {
4363 *result = x;
4364 return 0;
4365 }
4366 else if (PyInt_Check(x)) {
4367 long value = PyInt_AS_LONG(x);
4368 long max = PyUnicode_GetMax();
4369 if (value < 0 || value > max) {
4370 PyErr_Format(PyExc_TypeError,
4371 "character mapping must be in range(0x%lx)", max+1);
4372 Py_DECREF(x);
4373 return -1;
4374 }
4375 *result = x;
4376 return 0;
4377 }
4378 else if (PyUnicode_Check(x)) {
4379 *result = x;
4380 return 0;
4381 }
4382 else {
4383 /* wrong return value */
4384 PyErr_SetString(PyExc_TypeError,
4385 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004386 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004387 return -1;
4388 }
4389}
4390/* ensure that *outobj is at least requiredsize characters long,
4391if not reallocate and adjust various state variables.
4392Return 0 on success, -1 on error */
4393static
Walter Dörwald4894c302003-10-24 14:25:28 +00004394int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004395 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004396{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004397 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004398 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004399 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004400 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004401 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004402 if (requiredsize < 2 * oldsize)
4403 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004404 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004405 return -1;
4406 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004407 }
4408 return 0;
4409}
4410/* lookup the character, put the result in the output string and adjust
4411 various state variables. Return a new reference to the object that
4412 was put in the output buffer in *result, or Py_None, if the mapping was
4413 undefined (in which case no character was written).
4414 The called must decref result.
4415 Return 0 on success, -1 on error. */
4416static
Walter Dörwald4894c302003-10-24 14:25:28 +00004417int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004418 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004419 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004420{
Walter Dörwald4894c302003-10-24 14:25:28 +00004421 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004422 return -1;
4423 if (*res==NULL) {
4424 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004425 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004426 }
4427 else if (*res==Py_None)
4428 ;
4429 else if (PyInt_Check(*res)) {
4430 /* no overflow check, because we know that the space is enough */
4431 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4432 }
4433 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004434 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435 if (repsize==1) {
4436 /* no overflow check, because we know that the space is enough */
4437 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4438 }
4439 else if (repsize!=0) {
4440 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004441 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004442 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004443 repsize - 1;
4444 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004445 return -1;
4446 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4447 *outp += repsize;
4448 }
4449 }
4450 else
4451 return -1;
4452 return 0;
4453}
4454
4455PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004456 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004457 PyObject *mapping,
4458 const char *errors)
4459{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004460 /* output object */
4461 PyObject *res = NULL;
4462 /* pointers to the beginning and end+1 of input */
4463 const Py_UNICODE *startp = p;
4464 const Py_UNICODE *endp = p + size;
4465 /* pointer into the output */
4466 Py_UNICODE *str;
4467 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004468 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004469 char *reason = "character maps to <undefined>";
4470 PyObject *errorHandler = NULL;
4471 PyObject *exc = NULL;
4472 /* the following variable is used for caching string comparisons
4473 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4474 * 3=ignore, 4=xmlcharrefreplace */
4475 int known_errorHandler = -1;
4476
Guido van Rossumd57fd912000-03-10 22:53:23 +00004477 if (mapping == NULL) {
4478 PyErr_BadArgument();
4479 return NULL;
4480 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004481
4482 /* allocate enough for a simple 1:1 translation without
4483 replacements, if we need more, we'll resize */
4484 res = PyUnicode_FromUnicode(NULL, size);
4485 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004486 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004487 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004488 return res;
4489 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004490
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004491 while (p<endp) {
4492 /* try to encode it */
4493 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004494 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004495 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004496 goto onError;
4497 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004498 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004499 if (x!=Py_None) /* it worked => adjust input pointer */
4500 ++p;
4501 else { /* untranslatable character */
4502 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004503 Py_ssize_t repsize;
4504 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004505 Py_UNICODE *uni2;
4506 /* startpos for collecting untranslatable chars */
4507 const Py_UNICODE *collstart = p;
4508 const Py_UNICODE *collend = p+1;
4509 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004510
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004511 /* find all untranslatable characters */
4512 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004513 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004514 goto onError;
4515 Py_XDECREF(x);
4516 if (x!=Py_None)
4517 break;
4518 ++collend;
4519 }
4520 /* cache callback name lookup
4521 * (if not done yet, i.e. it's the first error) */
4522 if (known_errorHandler==-1) {
4523 if ((errors==NULL) || (!strcmp(errors, "strict")))
4524 known_errorHandler = 1;
4525 else if (!strcmp(errors, "replace"))
4526 known_errorHandler = 2;
4527 else if (!strcmp(errors, "ignore"))
4528 known_errorHandler = 3;
4529 else if (!strcmp(errors, "xmlcharrefreplace"))
4530 known_errorHandler = 4;
4531 else
4532 known_errorHandler = 0;
4533 }
4534 switch (known_errorHandler) {
4535 case 1: /* strict */
4536 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4537 goto onError;
4538 case 2: /* replace */
4539 /* No need to check for space, this is a 1:1 replacement */
4540 for (coll = collstart; coll<collend; ++coll)
4541 *str++ = '?';
4542 /* fall through */
4543 case 3: /* ignore */
4544 p = collend;
4545 break;
4546 case 4: /* xmlcharrefreplace */
4547 /* generate replacement (temporarily (mis)uses p) */
4548 for (p = collstart; p < collend; ++p) {
4549 char buffer[2+29+1+1];
4550 char *cp;
4551 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004552 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004553 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4554 goto onError;
4555 for (cp = buffer; *cp; ++cp)
4556 *str++ = *cp;
4557 }
4558 p = collend;
4559 break;
4560 default:
4561 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4562 reason, startp, size, &exc,
4563 collstart-startp, collend-startp, &newpos);
4564 if (repunicode == NULL)
4565 goto onError;
4566 /* generate replacement */
4567 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004568 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004569 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4570 Py_DECREF(repunicode);
4571 goto onError;
4572 }
4573 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4574 *str++ = *uni2;
4575 p = startp + newpos;
4576 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004577 }
4578 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004579 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004580 /* Resize if we allocated to much */
4581 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004582 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004583 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004584 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004585 }
4586 Py_XDECREF(exc);
4587 Py_XDECREF(errorHandler);
4588 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004589
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004590 onError:
4591 Py_XDECREF(res);
4592 Py_XDECREF(exc);
4593 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004594 return NULL;
4595}
4596
4597PyObject *PyUnicode_Translate(PyObject *str,
4598 PyObject *mapping,
4599 const char *errors)
4600{
4601 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004602
Guido van Rossumd57fd912000-03-10 22:53:23 +00004603 str = PyUnicode_FromObject(str);
4604 if (str == NULL)
4605 goto onError;
4606 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4607 PyUnicode_GET_SIZE(str),
4608 mapping,
4609 errors);
4610 Py_DECREF(str);
4611 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004612
Guido van Rossumd57fd912000-03-10 22:53:23 +00004613 onError:
4614 Py_XDECREF(str);
4615 return NULL;
4616}
Tim Petersced69f82003-09-16 20:30:58 +00004617
Guido van Rossum9e896b32000-04-05 20:11:21 +00004618/* --- Decimal Encoder ---------------------------------------------------- */
4619
4620int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004621 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004622 char *output,
4623 const char *errors)
4624{
4625 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004626 PyObject *errorHandler = NULL;
4627 PyObject *exc = NULL;
4628 const char *encoding = "decimal";
4629 const char *reason = "invalid decimal Unicode string";
4630 /* the following variable is used for caching string comparisons
4631 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4632 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004633
4634 if (output == NULL) {
4635 PyErr_BadArgument();
4636 return -1;
4637 }
4638
4639 p = s;
4640 end = s + length;
4641 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004642 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004643 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004644 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004645 Py_ssize_t repsize;
4646 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004647 Py_UNICODE *uni2;
4648 Py_UNICODE *collstart;
4649 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004650
Guido van Rossum9e896b32000-04-05 20:11:21 +00004651 if (Py_UNICODE_ISSPACE(ch)) {
4652 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004653 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004654 continue;
4655 }
4656 decimal = Py_UNICODE_TODECIMAL(ch);
4657 if (decimal >= 0) {
4658 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004659 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004660 continue;
4661 }
Guido van Rossumba477042000-04-06 18:18:10 +00004662 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004663 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004664 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004665 continue;
4666 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004667 /* All other characters are considered unencodable */
4668 collstart = p;
4669 collend = p+1;
4670 while (collend < end) {
4671 if ((0 < *collend && *collend < 256) ||
4672 !Py_UNICODE_ISSPACE(*collend) ||
4673 Py_UNICODE_TODECIMAL(*collend))
4674 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004675 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004676 /* cache callback name lookup
4677 * (if not done yet, i.e. it's the first error) */
4678 if (known_errorHandler==-1) {
4679 if ((errors==NULL) || (!strcmp(errors, "strict")))
4680 known_errorHandler = 1;
4681 else if (!strcmp(errors, "replace"))
4682 known_errorHandler = 2;
4683 else if (!strcmp(errors, "ignore"))
4684 known_errorHandler = 3;
4685 else if (!strcmp(errors, "xmlcharrefreplace"))
4686 known_errorHandler = 4;
4687 else
4688 known_errorHandler = 0;
4689 }
4690 switch (known_errorHandler) {
4691 case 1: /* strict */
4692 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4693 goto onError;
4694 case 2: /* replace */
4695 for (p = collstart; p < collend; ++p)
4696 *output++ = '?';
4697 /* fall through */
4698 case 3: /* ignore */
4699 p = collend;
4700 break;
4701 case 4: /* xmlcharrefreplace */
4702 /* generate replacement (temporarily (mis)uses p) */
4703 for (p = collstart; p < collend; ++p)
4704 output += sprintf(output, "&#%d;", (int)*p);
4705 p = collend;
4706 break;
4707 default:
4708 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4709 encoding, reason, s, length, &exc,
4710 collstart-s, collend-s, &newpos);
4711 if (repunicode == NULL)
4712 goto onError;
4713 /* generate replacement */
4714 repsize = PyUnicode_GET_SIZE(repunicode);
4715 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4716 Py_UNICODE ch = *uni2;
4717 if (Py_UNICODE_ISSPACE(ch))
4718 *output++ = ' ';
4719 else {
4720 decimal = Py_UNICODE_TODECIMAL(ch);
4721 if (decimal >= 0)
4722 *output++ = '0' + decimal;
4723 else if (0 < ch && ch < 256)
4724 *output++ = (char)ch;
4725 else {
4726 Py_DECREF(repunicode);
4727 raise_encode_exception(&exc, encoding,
4728 s, length, collstart-s, collend-s, reason);
4729 goto onError;
4730 }
4731 }
4732 }
4733 p = s + newpos;
4734 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004735 }
4736 }
4737 /* 0-terminate the output string */
4738 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004739 Py_XDECREF(exc);
4740 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004741 return 0;
4742
4743 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004744 Py_XDECREF(exc);
4745 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004746 return -1;
4747}
4748
Guido van Rossumd57fd912000-03-10 22:53:23 +00004749/* --- Helpers ------------------------------------------------------------ */
4750
Thomas Wouters477c8d52006-05-27 19:21:47 +00004751#define STRINGLIB_CHAR Py_UNICODE
4752
4753#define STRINGLIB_LEN PyUnicode_GET_SIZE
4754#define STRINGLIB_NEW PyUnicode_FromUnicode
4755#define STRINGLIB_STR PyUnicode_AS_UNICODE
4756
4757Py_LOCAL_INLINE(int)
4758STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004760 if (str[0] != other[0])
4761 return 1;
4762 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763}
4764
Thomas Wouters477c8d52006-05-27 19:21:47 +00004765#define STRINGLIB_EMPTY unicode_empty
4766
4767#include "stringlib/fastsearch.h"
4768
4769#include "stringlib/count.h"
4770#include "stringlib/find.h"
4771#include "stringlib/partition.h"
4772
/* helper macro to fixup start/end slice values
 *
 * Operates on the variables named `start` and `end` of the expanding scope
 * and on (obj)->length:
 *   - a negative start is offset by the length, then clamped to 0;
 *   - an end beyond the length is clamped to the length;
 *   - a negative end is offset by the length, then clamped to 0.
 * start is not clamped against the length.
 *
 * Wrapped in do { } while (0) so that `FIX_START_END(x);` is a single
 * statement and is safe inside an unbraced if/else.
 */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
4785
Martin v. Löwis18e16552006-02-15 17:27:45 +00004786Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004787 PyObject *substr,
4788 Py_ssize_t start,
4789 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004790{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004791 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004792 PyUnicodeObject* str_obj;
4793 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004794
Thomas Wouters477c8d52006-05-27 19:21:47 +00004795 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4796 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004798 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4799 if (!sub_obj) {
4800 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004801 return -1;
4802 }
Tim Petersced69f82003-09-16 20:30:58 +00004803
Thomas Wouters477c8d52006-05-27 19:21:47 +00004804 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004805
Thomas Wouters477c8d52006-05-27 19:21:47 +00004806 result = stringlib_count(
4807 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4808 );
4809
4810 Py_DECREF(sub_obj);
4811 Py_DECREF(str_obj);
4812
Guido van Rossumd57fd912000-03-10 22:53:23 +00004813 return result;
4814}
4815
Martin v. Löwis18e16552006-02-15 17:27:45 +00004816Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004817 PyObject *sub,
4818 Py_ssize_t start,
4819 Py_ssize_t end,
4820 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004821{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004822 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004823
Guido van Rossumd57fd912000-03-10 22:53:23 +00004824 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004825 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004826 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004827 sub = PyUnicode_FromObject(sub);
4828 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004829 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004830 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004831 }
Tim Petersced69f82003-09-16 20:30:58 +00004832
Thomas Wouters477c8d52006-05-27 19:21:47 +00004833 if (direction > 0)
4834 result = stringlib_find_slice(
4835 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4836 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4837 start, end
4838 );
4839 else
4840 result = stringlib_rfind_slice(
4841 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4842 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4843 start, end
4844 );
4845
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004847 Py_DECREF(sub);
4848
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849 return result;
4850}
4851
Tim Petersced69f82003-09-16 20:30:58 +00004852static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004853int tailmatch(PyUnicodeObject *self,
4854 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004855 Py_ssize_t start,
4856 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857 int direction)
4858{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859 if (substring->length == 0)
4860 return 1;
4861
Thomas Wouters477c8d52006-05-27 19:21:47 +00004862 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004863
4864 end -= substring->length;
4865 if (end < start)
4866 return 0;
4867
4868 if (direction > 0) {
4869 if (Py_UNICODE_MATCH(self, end, substring))
4870 return 1;
4871 } else {
4872 if (Py_UNICODE_MATCH(self, start, substring))
4873 return 1;
4874 }
4875
4876 return 0;
4877}
4878
Martin v. Löwis18e16552006-02-15 17:27:45 +00004879Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004881 Py_ssize_t start,
4882 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004883 int direction)
4884{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004885 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004886
Guido van Rossumd57fd912000-03-10 22:53:23 +00004887 str = PyUnicode_FromObject(str);
4888 if (str == NULL)
4889 return -1;
4890 substr = PyUnicode_FromObject(substr);
4891 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004892 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004893 return -1;
4894 }
Tim Petersced69f82003-09-16 20:30:58 +00004895
Guido van Rossumd57fd912000-03-10 22:53:23 +00004896 result = tailmatch((PyUnicodeObject *)str,
4897 (PyUnicodeObject *)substr,
4898 start, end, direction);
4899 Py_DECREF(str);
4900 Py_DECREF(substr);
4901 return result;
4902}
4903
Guido van Rossumd57fd912000-03-10 22:53:23 +00004904/* Apply fixfct filter to the Unicode object self and return a
4905 reference to the modified object */
4906
Tim Petersced69f82003-09-16 20:30:58 +00004907static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908PyObject *fixup(PyUnicodeObject *self,
4909 int (*fixfct)(PyUnicodeObject *s))
4910{
4911
4912 PyUnicodeObject *u;
4913
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004914 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004915 if (u == NULL)
4916 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004917
4918 Py_UNICODE_COPY(u->str, self->str, self->length);
4919
Tim Peters7a29bd52001-09-12 03:03:31 +00004920 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004921 /* fixfct should return TRUE if it modified the buffer. If
4922 FALSE, return a reference to the original buffer instead
4923 (to save space, not time) */
4924 Py_INCREF(self);
4925 Py_DECREF(u);
4926 return (PyObject*) self;
4927 }
4928 return (PyObject*) u;
4929}
4930
Tim Petersced69f82003-09-16 20:30:58 +00004931static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004932int fixupper(PyUnicodeObject *self)
4933{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004934 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004935 Py_UNICODE *s = self->str;
4936 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004937
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938 while (len-- > 0) {
4939 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004940
Guido van Rossumd57fd912000-03-10 22:53:23 +00004941 ch = Py_UNICODE_TOUPPER(*s);
4942 if (ch != *s) {
4943 status = 1;
4944 *s = ch;
4945 }
4946 s++;
4947 }
4948
4949 return status;
4950}
4951
Tim Petersced69f82003-09-16 20:30:58 +00004952static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004953int fixlower(PyUnicodeObject *self)
4954{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004955 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004956 Py_UNICODE *s = self->str;
4957 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004958
Guido van Rossumd57fd912000-03-10 22:53:23 +00004959 while (len-- > 0) {
4960 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004961
Guido van Rossumd57fd912000-03-10 22:53:23 +00004962 ch = Py_UNICODE_TOLOWER(*s);
4963 if (ch != *s) {
4964 status = 1;
4965 *s = ch;
4966 }
4967 s++;
4968 }
4969
4970 return status;
4971}
4972
Tim Petersced69f82003-09-16 20:30:58 +00004973static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004974int fixswapcase(PyUnicodeObject *self)
4975{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004976 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004977 Py_UNICODE *s = self->str;
4978 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004979
Guido van Rossumd57fd912000-03-10 22:53:23 +00004980 while (len-- > 0) {
4981 if (Py_UNICODE_ISUPPER(*s)) {
4982 *s = Py_UNICODE_TOLOWER(*s);
4983 status = 1;
4984 } else if (Py_UNICODE_ISLOWER(*s)) {
4985 *s = Py_UNICODE_TOUPPER(*s);
4986 status = 1;
4987 }
4988 s++;
4989 }
4990
4991 return status;
4992}
4993
Tim Petersced69f82003-09-16 20:30:58 +00004994static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004995int fixcapitalize(PyUnicodeObject *self)
4996{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004997 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004998 Py_UNICODE *s = self->str;
4999 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005000
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005001 if (len == 0)
5002 return 0;
5003 if (Py_UNICODE_ISLOWER(*s)) {
5004 *s = Py_UNICODE_TOUPPER(*s);
5005 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005006 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005007 s++;
5008 while (--len > 0) {
5009 if (Py_UNICODE_ISUPPER(*s)) {
5010 *s = Py_UNICODE_TOLOWER(*s);
5011 status = 1;
5012 }
5013 s++;
5014 }
5015 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005016}
5017
5018static
5019int fixtitle(PyUnicodeObject *self)
5020{
5021 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5022 register Py_UNICODE *e;
5023 int previous_is_cased;
5024
5025 /* Shortcut for single character strings */
5026 if (PyUnicode_GET_SIZE(self) == 1) {
5027 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5028 if (*p != ch) {
5029 *p = ch;
5030 return 1;
5031 }
5032 else
5033 return 0;
5034 }
Tim Petersced69f82003-09-16 20:30:58 +00005035
Guido van Rossumd57fd912000-03-10 22:53:23 +00005036 e = p + PyUnicode_GET_SIZE(self);
5037 previous_is_cased = 0;
5038 for (; p < e; p++) {
5039 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005040
Guido van Rossumd57fd912000-03-10 22:53:23 +00005041 if (previous_is_cased)
5042 *p = Py_UNICODE_TOLOWER(ch);
5043 else
5044 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005045
5046 if (Py_UNICODE_ISLOWER(ch) ||
5047 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005048 Py_UNICODE_ISTITLE(ch))
5049 previous_is_cased = 1;
5050 else
5051 previous_is_cased = 0;
5052 }
5053 return 1;
5054}
5055
Tim Peters8ce9f162004-08-27 01:49:32 +00005056PyObject *
5057PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005058{
Tim Peters8ce9f162004-08-27 01:49:32 +00005059 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005060 const Py_UNICODE blank = ' ';
5061 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005062 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005063 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005064 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5065 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005066 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5067 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005068 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005069 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005070 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005071
Tim Peters05eba1f2004-08-27 21:32:02 +00005072 fseq = PySequence_Fast(seq, "");
5073 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005074 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005075 }
5076
Tim Peters91879ab2004-08-27 22:35:44 +00005077 /* Grrrr. A codec may be invoked to convert str objects to
5078 * Unicode, and so it's possible to call back into Python code
5079 * during PyUnicode_FromObject(), and so it's possible for a sick
5080 * codec to change the size of fseq (if seq is a list). Therefore
5081 * we have to keep refetching the size -- can't assume seqlen
5082 * is invariant.
5083 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005084 seqlen = PySequence_Fast_GET_SIZE(fseq);
5085 /* If empty sequence, return u"". */
5086 if (seqlen == 0) {
5087 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5088 goto Done;
5089 }
5090 /* If singleton sequence with an exact Unicode, return that. */
5091 if (seqlen == 1) {
5092 item = PySequence_Fast_GET_ITEM(fseq, 0);
5093 if (PyUnicode_CheckExact(item)) {
5094 Py_INCREF(item);
5095 res = (PyUnicodeObject *)item;
5096 goto Done;
5097 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005098 }
5099
Tim Peters05eba1f2004-08-27 21:32:02 +00005100 /* At least two items to join, or one that isn't exact Unicode. */
5101 if (seqlen > 1) {
5102 /* Set up sep and seplen -- they're needed. */
5103 if (separator == NULL) {
5104 sep = &blank;
5105 seplen = 1;
5106 }
5107 else {
5108 internal_separator = PyUnicode_FromObject(separator);
5109 if (internal_separator == NULL)
5110 goto onError;
5111 sep = PyUnicode_AS_UNICODE(internal_separator);
5112 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005113 /* In case PyUnicode_FromObject() mutated seq. */
5114 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005115 }
5116 }
5117
5118 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005119 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005120 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005121 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005122 res_p = PyUnicode_AS_UNICODE(res);
5123 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005124
Tim Peters05eba1f2004-08-27 21:32:02 +00005125 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005126 Py_ssize_t itemlen;
5127 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005128
5129 item = PySequence_Fast_GET_ITEM(fseq, i);
5130 /* Convert item to Unicode. */
5131 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5132 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005133 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005134 " %.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00005135 i, Py_Type(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005136 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005137 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005138 item = PyUnicode_FromObject(item);
5139 if (item == NULL)
5140 goto onError;
5141 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005142
Tim Peters91879ab2004-08-27 22:35:44 +00005143 /* In case PyUnicode_FromObject() mutated seq. */
5144 seqlen = PySequence_Fast_GET_SIZE(fseq);
5145
Tim Peters8ce9f162004-08-27 01:49:32 +00005146 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005147 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005148 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005149 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005150 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005151 if (i < seqlen - 1) {
5152 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005153 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005154 goto Overflow;
5155 }
5156 if (new_res_used > res_alloc) {
5157 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005158 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005159 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005160 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005161 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005162 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005163 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005164 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005165 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005166 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005167 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005168 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005169
5170 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005171 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005172 res_p += itemlen;
5173 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005174 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005175 res_p += seplen;
5176 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005177 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005178 res_used = new_res_used;
5179 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005180
Tim Peters05eba1f2004-08-27 21:32:02 +00005181 /* Shrink res to match the used area; this probably can't fail,
5182 * but it's cheap to check.
5183 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005184 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005185 goto onError;
5186
5187 Done:
5188 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005189 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190 return (PyObject *)res;
5191
Tim Peters8ce9f162004-08-27 01:49:32 +00005192 Overflow:
5193 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005194 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005195 Py_DECREF(item);
5196 /* fall through */
5197
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005199 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005200 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005201 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202 return NULL;
5203}
5204
Tim Petersced69f82003-09-16 20:30:58 +00005205static
5206PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005207 Py_ssize_t left,
5208 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209 Py_UNICODE fill)
5210{
5211 PyUnicodeObject *u;
5212
5213 if (left < 0)
5214 left = 0;
5215 if (right < 0)
5216 right = 0;
5217
Tim Peters7a29bd52001-09-12 03:03:31 +00005218 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219 Py_INCREF(self);
5220 return self;
5221 }
5222
5223 u = _PyUnicode_New(left + self->length + right);
5224 if (u) {
5225 if (left)
5226 Py_UNICODE_FILL(u->str, fill, left);
5227 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5228 if (right)
5229 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5230 }
5231
5232 return u;
5233}
5234
/* Append the substring data[left:right] to the list being built.
 *
 * Relies on names from the expanding function: a `PyObject *str` scratch
 * variable, the target `list`, and an `onError` label that is jumped to
 * when creating the substring or appending it fails.  The temporary
 * reference held in str is always released.
 *
 * Wrapped in do { } while (0) so that `SPLIT_APPEND(...);` behaves as one
 * statement and cannot capture a following `else`.
 */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5245
5246static
5247PyObject *split_whitespace(PyUnicodeObject *self,
5248 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005249 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005250{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005251 register Py_ssize_t i;
5252 register Py_ssize_t j;
5253 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254 PyObject *str;
5255
5256 for (i = j = 0; i < len; ) {
5257 /* find a token */
5258 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5259 i++;
5260 j = i;
5261 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5262 i++;
5263 if (j < i) {
5264 if (maxcount-- <= 0)
5265 break;
5266 SPLIT_APPEND(self->str, j, i);
5267 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5268 i++;
5269 j = i;
5270 }
5271 }
5272 if (j < len) {
5273 SPLIT_APPEND(self->str, j, len);
5274 }
5275 return list;
5276
5277 onError:
5278 Py_DECREF(list);
5279 return NULL;
5280}
5281
5282PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005283 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005284{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005285 register Py_ssize_t i;
5286 register Py_ssize_t j;
5287 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288 PyObject *list;
5289 PyObject *str;
5290 Py_UNICODE *data;
5291
5292 string = PyUnicode_FromObject(string);
5293 if (string == NULL)
5294 return NULL;
5295 data = PyUnicode_AS_UNICODE(string);
5296 len = PyUnicode_GET_SIZE(string);
5297
Guido van Rossumd57fd912000-03-10 22:53:23 +00005298 list = PyList_New(0);
5299 if (!list)
5300 goto onError;
5301
5302 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005303 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005304
Guido van Rossumd57fd912000-03-10 22:53:23 +00005305 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005306 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005308
5309 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005310 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311 if (i < len) {
5312 if (data[i] == '\r' && i + 1 < len &&
5313 data[i+1] == '\n')
5314 i += 2;
5315 else
5316 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005317 if (keepends)
5318 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005319 }
Guido van Rossum86662912000-04-11 15:38:46 +00005320 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321 j = i;
5322 }
5323 if (j < len) {
5324 SPLIT_APPEND(data, j, len);
5325 }
5326
5327 Py_DECREF(string);
5328 return list;
5329
5330 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005331 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332 Py_DECREF(string);
5333 return NULL;
5334}
5335
Tim Petersced69f82003-09-16 20:30:58 +00005336static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337PyObject *split_char(PyUnicodeObject *self,
5338 PyObject *list,
5339 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005340 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005341{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005342 register Py_ssize_t i;
5343 register Py_ssize_t j;
5344 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345 PyObject *str;
5346
5347 for (i = j = 0; i < len; ) {
5348 if (self->str[i] == ch) {
5349 if (maxcount-- <= 0)
5350 break;
5351 SPLIT_APPEND(self->str, j, i);
5352 i = j = i + 1;
5353 } else
5354 i++;
5355 }
5356 if (j <= len) {
5357 SPLIT_APPEND(self->str, j, len);
5358 }
5359 return list;
5360
5361 onError:
5362 Py_DECREF(list);
5363 return NULL;
5364}
5365
Tim Petersced69f82003-09-16 20:30:58 +00005366static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005367PyObject *split_substring(PyUnicodeObject *self,
5368 PyObject *list,
5369 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005370 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005372 register Py_ssize_t i;
5373 register Py_ssize_t j;
5374 Py_ssize_t len = self->length;
5375 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376 PyObject *str;
5377
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005378 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379 if (Py_UNICODE_MATCH(self, i, substring)) {
5380 if (maxcount-- <= 0)
5381 break;
5382 SPLIT_APPEND(self->str, j, i);
5383 i = j = i + sublen;
5384 } else
5385 i++;
5386 }
5387 if (j <= len) {
5388 SPLIT_APPEND(self->str, j, len);
5389 }
5390 return list;
5391
5392 onError:
5393 Py_DECREF(list);
5394 return NULL;
5395}
5396
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005397static
5398PyObject *rsplit_whitespace(PyUnicodeObject *self,
5399 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005400 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005401{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005402 register Py_ssize_t i;
5403 register Py_ssize_t j;
5404 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005405 PyObject *str;
5406
5407 for (i = j = len - 1; i >= 0; ) {
5408 /* find a token */
5409 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5410 i--;
5411 j = i;
5412 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5413 i--;
5414 if (j > i) {
5415 if (maxcount-- <= 0)
5416 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005417 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005418 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5419 i--;
5420 j = i;
5421 }
5422 }
5423 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005424 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005425 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005426 if (PyList_Reverse(list) < 0)
5427 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005428 return list;
5429
5430 onError:
5431 Py_DECREF(list);
5432 return NULL;
5433}
5434
5435static
5436PyObject *rsplit_char(PyUnicodeObject *self,
5437 PyObject *list,
5438 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005439 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005440{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005441 register Py_ssize_t i;
5442 register Py_ssize_t j;
5443 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005444 PyObject *str;
5445
5446 for (i = j = len - 1; i >= 0; ) {
5447 if (self->str[i] == ch) {
5448 if (maxcount-- <= 0)
5449 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005450 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005451 j = i = i - 1;
5452 } else
5453 i--;
5454 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005455 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005456 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005457 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005458 if (PyList_Reverse(list) < 0)
5459 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005460 return list;
5461
5462 onError:
5463 Py_DECREF(list);
5464 return NULL;
5465}
5466
5467static
5468PyObject *rsplit_substring(PyUnicodeObject *self,
5469 PyObject *list,
5470 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005471 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005472{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005473 register Py_ssize_t i;
5474 register Py_ssize_t j;
5475 Py_ssize_t len = self->length;
5476 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005477 PyObject *str;
5478
5479 for (i = len - sublen, j = len; i >= 0; ) {
5480 if (Py_UNICODE_MATCH(self, i, substring)) {
5481 if (maxcount-- <= 0)
5482 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005483 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005484 j = i;
5485 i -= sublen;
5486 } else
5487 i--;
5488 }
5489 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005490 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005491 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005492 if (PyList_Reverse(list) < 0)
5493 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005494 return list;
5495
5496 onError:
5497 Py_DECREF(list);
5498 return NULL;
5499}
5500
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501#undef SPLIT_APPEND
5502
5503static
5504PyObject *split(PyUnicodeObject *self,
5505 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005506 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507{
5508 PyObject *list;
5509
5510 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005511 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005512
5513 list = PyList_New(0);
5514 if (!list)
5515 return NULL;
5516
5517 if (substring == NULL)
5518 return split_whitespace(self,list,maxcount);
5519
5520 else if (substring->length == 1)
5521 return split_char(self,list,substring->str[0],maxcount);
5522
5523 else if (substring->length == 0) {
5524 Py_DECREF(list);
5525 PyErr_SetString(PyExc_ValueError, "empty separator");
5526 return NULL;
5527 }
5528 else
5529 return split_substring(self,list,substring,maxcount);
5530}
5531
Tim Petersced69f82003-09-16 20:30:58 +00005532static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005533PyObject *rsplit(PyUnicodeObject *self,
5534 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005535 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005536{
5537 PyObject *list;
5538
5539 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005540 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005541
5542 list = PyList_New(0);
5543 if (!list)
5544 return NULL;
5545
5546 if (substring == NULL)
5547 return rsplit_whitespace(self,list,maxcount);
5548
5549 else if (substring->length == 1)
5550 return rsplit_char(self,list,substring->str[0],maxcount);
5551
5552 else if (substring->length == 0) {
5553 Py_DECREF(list);
5554 PyErr_SetString(PyExc_ValueError, "empty separator");
5555 return NULL;
5556 }
5557 else
5558 return rsplit_substring(self,list,substring,maxcount);
5559}
5560
5561static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562PyObject *replace(PyUnicodeObject *self,
5563 PyUnicodeObject *str1,
5564 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005565 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566{
5567 PyUnicodeObject *u;
5568
5569 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005570 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571
Thomas Wouters477c8d52006-05-27 19:21:47 +00005572 if (str1->length == str2->length) {
5573 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005574 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005575 if (str1->length == 1) {
5576 /* replace characters */
5577 Py_UNICODE u1, u2;
5578 if (!findchar(self->str, self->length, str1->str[0]))
5579 goto nothing;
5580 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5581 if (!u)
5582 return NULL;
5583 Py_UNICODE_COPY(u->str, self->str, self->length);
5584 u1 = str1->str[0];
5585 u2 = str2->str[0];
5586 for (i = 0; i < u->length; i++)
5587 if (u->str[i] == u1) {
5588 if (--maxcount < 0)
5589 break;
5590 u->str[i] = u2;
5591 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005593 i = fastsearch(
5594 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005596 if (i < 0)
5597 goto nothing;
5598 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5599 if (!u)
5600 return NULL;
5601 Py_UNICODE_COPY(u->str, self->str, self->length);
5602 while (i <= self->length - str1->length)
5603 if (Py_UNICODE_MATCH(self, i, str1)) {
5604 if (--maxcount < 0)
5605 break;
5606 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5607 i += str1->length;
5608 } else
5609 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005612
5613 Py_ssize_t n, i, j, e;
5614 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005615 Py_UNICODE *p;
5616
5617 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005618 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619 if (n > maxcount)
5620 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005621 if (n == 0)
5622 goto nothing;
5623 /* new_size = self->length + n * (str2->length - str1->length)); */
5624 delta = (str2->length - str1->length);
5625 if (delta == 0) {
5626 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005628 product = n * (str2->length - str1->length);
5629 if ((product / (str2->length - str1->length)) != n) {
5630 PyErr_SetString(PyExc_OverflowError,
5631 "replace string is too long");
5632 return NULL;
5633 }
5634 new_size = self->length + product;
5635 if (new_size < 0) {
5636 PyErr_SetString(PyExc_OverflowError,
5637 "replace string is too long");
5638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639 }
5640 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005641 u = _PyUnicode_New(new_size);
5642 if (!u)
5643 return NULL;
5644 i = 0;
5645 p = u->str;
5646 e = self->length - str1->length;
5647 if (str1->length > 0) {
5648 while (n-- > 0) {
5649 /* look for next match */
5650 j = i;
5651 while (j <= e) {
5652 if (Py_UNICODE_MATCH(self, j, str1))
5653 break;
5654 j++;
5655 }
5656 if (j > i) {
5657 if (j > e)
5658 break;
5659 /* copy unchanged part [i:j] */
5660 Py_UNICODE_COPY(p, self->str+i, j-i);
5661 p += j - i;
5662 }
5663 /* copy substitution string */
5664 if (str2->length > 0) {
5665 Py_UNICODE_COPY(p, str2->str, str2->length);
5666 p += str2->length;
5667 }
5668 i = j + str1->length;
5669 }
5670 if (i < self->length)
5671 /* copy tail [i:] */
5672 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5673 } else {
5674 /* interleave */
5675 while (n > 0) {
5676 Py_UNICODE_COPY(p, str2->str, str2->length);
5677 p += str2->length;
5678 if (--n <= 0)
5679 break;
5680 *p++ = self->str[i++];
5681 }
5682 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5683 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005686
5687nothing:
5688 /* nothing to replace; return original string (when possible) */
5689 if (PyUnicode_CheckExact(self)) {
5690 Py_INCREF(self);
5691 return (PyObject *) self;
5692 }
5693 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694}
5695
5696/* --- Unicode Object Methods --------------------------------------------- */
5697
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005698PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699"S.title() -> unicode\n\
5700\n\
5701Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005702characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703
5704static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005705unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707 return fixup(self, fixtitle);
5708}
5709
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005710PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711"S.capitalize() -> unicode\n\
5712\n\
5713Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005714have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715
5716static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005717unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719 return fixup(self, fixcapitalize);
5720}
5721
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled code (compiled out by the surrounding #if 0).

   Splits self on whitespace, runs every word through
   fixup(..., fixcapitalize) and joins the words again with
   PyUnicode_Join(NULL, ...).  Returns NULL as soon as the split or one
   of the fixup() calls returns NULL. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)word, fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return result;
}
#endif
5759
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005760/* Argument converter. Coerces to a single unicode character */
5761
5762static int
5763convert_uc(PyObject *obj, void *addr)
5764{
5765 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5766 PyObject *uniobj;
5767 Py_UNICODE *unistr;
5768
5769 uniobj = PyUnicode_FromObject(obj);
5770 if (uniobj == NULL) {
5771 PyErr_SetString(PyExc_TypeError,
5772 "The fill character cannot be converted to Unicode");
5773 return 0;
5774 }
5775 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5776 PyErr_SetString(PyExc_TypeError,
5777 "The fill character must be exactly one character long");
5778 Py_DECREF(uniobj);
5779 return 0;
5780 }
5781 unistr = PyUnicode_AS_UNICODE(uniobj);
5782 *fillcharloc = unistr[0];
5783 Py_DECREF(uniobj);
5784 return 1;
5785}
5786
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005787PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005788"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005790Return S centered in a Unicode string of length width. Padding is\n\
5791done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792
5793static PyObject *
5794unicode_center(PyUnicodeObject *self, PyObject *args)
5795{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005796 Py_ssize_t marg, left;
5797 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005798 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799
Thomas Woutersde017742006-02-16 19:34:37 +00005800 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005801 return NULL;
5802
Tim Peters7a29bd52001-09-12 03:03:31 +00005803 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005804 Py_INCREF(self);
5805 return (PyObject*) self;
5806 }
5807
5808 marg = width - self->length;
5809 left = marg / 2 + (marg & width & 1);
5810
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005811 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005812}
5813
Marc-André Lemburge5034372000-08-08 08:04:29 +00005814#if 0
5815
5816/* This code should go into some future Unicode collation support
5817 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005818 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005819
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005820/* speedy UTF-16 code point order comparison */
5821/* gleaned from: */
5822/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5823
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005824static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005825{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005826 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005827 0, 0, 0, 0, 0, 0, 0, 0,
5828 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005829 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005830};
5831
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832static int
5833unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5834{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005835 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005836
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837 Py_UNICODE *s1 = str1->str;
5838 Py_UNICODE *s2 = str2->str;
5839
5840 len1 = str1->length;
5841 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005842
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005844 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005845
5846 c1 = *s1++;
5847 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005848
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005849 if (c1 > (1<<11) * 26)
5850 c1 += utf16Fixup[c1>>11];
5851 if (c2 > (1<<11) * 26)
5852 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005853 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005854
5855 if (c1 != c2)
5856 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005857
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005858 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859 }
5860
5861 return (len1 < len2) ? -1 : (len1 != len2);
5862}
5863
Marc-André Lemburge5034372000-08-08 08:04:29 +00005864#else
5865
5866static int
5867unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5868{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005869 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005870
5871 Py_UNICODE *s1 = str1->str;
5872 Py_UNICODE *s2 = str2->str;
5873
5874 len1 = str1->length;
5875 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005876
Marc-André Lemburge5034372000-08-08 08:04:29 +00005877 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005878 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005879
Fredrik Lundh45714e92001-06-26 16:39:36 +00005880 c1 = *s1++;
5881 c2 = *s2++;
5882
5883 if (c1 != c2)
5884 return (c1 < c2) ? -1 : 1;
5885
Marc-André Lemburge5034372000-08-08 08:04:29 +00005886 len1--; len2--;
5887 }
5888
5889 return (len1 < len2) ? -1 : (len1 != len2);
5890}
5891
5892#endif
5893
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894int PyUnicode_Compare(PyObject *left,
5895 PyObject *right)
5896{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005897 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5898 return unicode_compare((PyUnicodeObject *)left,
5899 (PyUnicodeObject *)right);
5900 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5901 (PyUnicode_Check(left) && PyString_Check(right))) {
5902 if (PyUnicode_Check(left))
5903 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5904 if (PyUnicode_Check(right))
5905 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5906 assert(PyString_Check(left));
5907 assert(PyString_Check(right));
5908 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005910 PyErr_Format(PyExc_TypeError,
5911 "Can't compare %.100s and %.100s",
5912 left->ob_type->tp_name,
5913 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 return -1;
5915}
5916
Martin v. Löwis5b222132007-06-10 09:51:05 +00005917int
5918PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
5919{
5920 int i;
5921 Py_UNICODE *id;
5922 assert(PyUnicode_Check(uni));
5923 id = PyUnicode_AS_UNICODE(uni);
5924 /* Compare Unicode string and source character set string */
5925 for (i = 0; id[i] && str[i]; i++)
5926 if (id[i] != str[i])
5927 return ((int)id[i] < (int)str[i]) ? -1 : 1;
5928 if (id[i])
5929 return 1; /* uni is longer */
5930 if (str[i])
5931 return -1; /* str is longer */
5932 return 0;
5933}
5934
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005935PyObject *PyUnicode_RichCompare(PyObject *left,
5936 PyObject *right,
5937 int op)
5938{
5939 int result;
5940
5941 result = PyUnicode_Compare(left, right);
5942 if (result == -1 && PyErr_Occurred())
5943 goto onError;
5944
5945 /* Convert the return value to a Boolean */
5946 switch (op) {
5947 case Py_EQ:
5948 result = (result == 0);
5949 break;
5950 case Py_NE:
5951 result = (result != 0);
5952 break;
5953 case Py_LE:
5954 result = (result <= 0);
5955 break;
5956 case Py_GE:
5957 result = (result >= 0);
5958 break;
5959 case Py_LT:
5960 result = (result == -1);
5961 break;
5962 case Py_GT:
5963 result = (result == 1);
5964 break;
5965 }
5966 return PyBool_FromLong(result);
5967
5968 onError:
5969
5970 /* Standard case
5971
5972 Type errors mean that PyUnicode_FromObject() could not convert
5973 one of the arguments (usually the right hand side) to Unicode,
5974 ie. we can't handle the comparison request. However, it is
5975 possible that the other object knows a comparison method, which
5976 is why we return Py_NotImplemented to give the other object a
5977 chance.
5978
5979 */
5980 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5981 PyErr_Clear();
5982 Py_INCREF(Py_NotImplemented);
5983 return Py_NotImplemented;
5984 }
5985 if (op != Py_EQ && op != Py_NE)
5986 return NULL;
5987
5988 /* Equality comparison.
5989
5990 This is a special case: we silence any PyExc_UnicodeDecodeError
5991 and instead turn it into a PyErr_UnicodeWarning.
5992
5993 */
5994 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5995 return NULL;
5996 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00005997 if (PyErr_WarnEx(PyExc_UnicodeWarning,
5998 (op == Py_EQ) ?
5999 "Unicode equal comparison "
6000 "failed to convert both arguments to Unicode - "
6001 "interpreting them as being unequal"
6002 :
6003 "Unicode unequal comparison "
6004 "failed to convert both arguments to Unicode - "
6005 "interpreting them as being unequal",
6006 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006007 return NULL;
6008 result = (op == Py_NE);
6009 return PyBool_FromLong(result);
6010}
6011
Guido van Rossum403d68b2000-03-13 15:55:09 +00006012int PyUnicode_Contains(PyObject *container,
6013 PyObject *element)
6014{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006015 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006016 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006017
6018 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006019 sub = PyUnicode_FromObject(element);
6020 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006021 PyErr_Format(PyExc_TypeError,
6022 "'in <string>' requires string as left operand, not %s",
6023 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006024 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006025 }
6026
Thomas Wouters477c8d52006-05-27 19:21:47 +00006027 str = PyUnicode_FromObject(container);
6028 if (!str) {
6029 Py_DECREF(sub);
6030 return -1;
6031 }
6032
6033 result = stringlib_contains_obj(str, sub);
6034
6035 Py_DECREF(str);
6036 Py_DECREF(sub);
6037
Guido van Rossum403d68b2000-03-13 15:55:09 +00006038 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006039}
6040
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041/* Concat to string or Unicode object giving a new Unicode object. */
6042
6043PyObject *PyUnicode_Concat(PyObject *left,
6044 PyObject *right)
6045{
6046 PyUnicodeObject *u = NULL, *v = NULL, *w;
6047
Guido van Rossum84d79dd2007-04-13 02:23:57 +00006048 if (PyBytes_Check(left) || PyBytes_Check(right))
6049 return PyBytes_Concat(left, right);
6050
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051 /* Coerce the two arguments */
6052 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6053 if (u == NULL)
6054 goto onError;
6055 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6056 if (v == NULL)
6057 goto onError;
6058
6059 /* Shortcuts */
6060 if (v == unicode_empty) {
6061 Py_DECREF(v);
6062 return (PyObject *)u;
6063 }
6064 if (u == unicode_empty) {
6065 Py_DECREF(u);
6066 return (PyObject *)v;
6067 }
6068
6069 /* Concat the two Unicode strings */
6070 w = _PyUnicode_New(u->length + v->length);
6071 if (w == NULL)
6072 goto onError;
6073 Py_UNICODE_COPY(w->str, u->str, u->length);
6074 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6075
6076 Py_DECREF(u);
6077 Py_DECREF(v);
6078 return (PyObject *)w;
6079
6080onError:
6081 Py_XDECREF(u);
6082 Py_XDECREF(v);
6083 return NULL;
6084}
6085
Walter Dörwald1ab83302007-05-18 17:15:44 +00006086void
6087PyUnicode_Append(PyObject **pleft, PyObject *right)
6088{
6089 PyObject *new;
6090 if (*pleft == NULL)
6091 return;
6092 if (right == NULL || !PyUnicode_Check(*pleft)) {
6093 Py_DECREF(*pleft);
6094 *pleft = NULL;
6095 return;
6096 }
6097 new = PyUnicode_Concat(*pleft, right);
6098 Py_DECREF(*pleft);
6099 *pleft = new;
6100}
6101
6102void
6103PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6104{
6105 PyUnicode_Append(pleft, right);
6106 Py_XDECREF(right);
6107}
6108
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006109PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110"S.count(sub[, start[, end]]) -> int\n\
6111\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006112Return the number of non-overlapping occurrences of substring sub in\n\
6113Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006114interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115
6116static PyObject *
6117unicode_count(PyUnicodeObject *self, PyObject *args)
6118{
6119 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006120 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006121 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122 PyObject *result;
6123
Guido van Rossumb8872e62000-05-09 14:14:27 +00006124 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6125 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126 return NULL;
6127
6128 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006129 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130 if (substring == NULL)
6131 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006132
Thomas Wouters477c8d52006-05-27 19:21:47 +00006133 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134
Thomas Wouters477c8d52006-05-27 19:21:47 +00006135 result = PyInt_FromSsize_t(
6136 stringlib_count(self->str + start, end - start,
6137 substring->str, substring->length)
6138 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139
6140 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006141
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142 return result;
6143}
6144
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006145PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006146"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006148Encodes S using the codec registered for encoding. encoding defaults\n\
6149to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006150handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006151a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6152'xmlcharrefreplace' as well as any other name registered with\n\
6153codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154
6155static PyObject *
6156unicode_encode(PyUnicodeObject *self, PyObject *args)
6157{
6158 char *encoding = NULL;
6159 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006160 PyObject *v;
6161
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6163 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006164 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006165 if (v == NULL)
6166 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006167 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00006168 if (PyString_Check(v)) {
6169 /* Old codec, turn it into bytes */
6170 PyObject *b = PyBytes_FromObject(v);
6171 Py_DECREF(v);
6172 return b;
6173 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006174 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006175 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006176 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006177 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006178 Py_DECREF(v);
6179 return NULL;
6180 }
6181 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006182
6183 onError:
6184 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006185}
6186
6187PyDoc_STRVAR(decode__doc__,
6188"S.decode([encoding[,errors]]) -> string or unicode\n\
6189\n\
6190Decodes S using the codec registered for encoding. encoding defaults\n\
6191to the default encoding. errors may be given to set a different error\n\
6192handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6193a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6194as well as any other name registerd with codecs.register_error that is\n\
6195able to handle UnicodeDecodeErrors.");
6196
6197static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006198unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006199{
6200 char *encoding = NULL;
6201 char *errors = NULL;
6202 PyObject *v;
6203
6204 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6205 return NULL;
6206 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006207 if (v == NULL)
6208 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006209 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6210 PyErr_Format(PyExc_TypeError,
6211 "decoder did not return a string/unicode object "
6212 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006213 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006214 Py_DECREF(v);
6215 return NULL;
6216 }
6217 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006218
6219 onError:
6220 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221}
6222
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006223PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224"S.expandtabs([tabsize]) -> unicode\n\
6225\n\
6226Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006227If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228
6229static PyObject*
6230unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6231{
6232 Py_UNICODE *e;
6233 Py_UNICODE *p;
6234 Py_UNICODE *q;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006235 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236 PyUnicodeObject *u;
6237 int tabsize = 8;
6238
6239 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6240 return NULL;
6241
Thomas Wouters7e474022000-07-16 12:04:32 +00006242 /* First pass: determine size of output string */
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006243 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244 e = self->str + self->length;
6245 for (p = self->str; p < e; p++)
6246 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006247 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006248 j += tabsize - (j % tabsize);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006249 if (old_j > j) {
6250 PyErr_SetString(PyExc_OverflowError,
6251 "new string is too long");
6252 return NULL;
6253 }
6254 old_j = j;
6255 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256 }
6257 else {
6258 j++;
6259 if (*p == '\n' || *p == '\r') {
6260 i += j;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006261 old_j = j = 0;
6262 if (i < 0) {
6263 PyErr_SetString(PyExc_OverflowError,
6264 "new string is too long");
6265 return NULL;
6266 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267 }
6268 }
6269
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006270 if ((i + j) < 0) {
6271 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6272 return NULL;
6273 }
6274
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275 /* Second pass: create output string and fill it */
6276 u = _PyUnicode_New(i + j);
6277 if (!u)
6278 return NULL;
6279
6280 j = 0;
6281 q = u->str;
6282
6283 for (p = self->str; p < e; p++)
6284 if (*p == '\t') {
6285 if (tabsize > 0) {
6286 i = tabsize - (j % tabsize);
6287 j += i;
6288 while (i--)
6289 *q++ = ' ';
6290 }
6291 }
6292 else {
6293 j++;
6294 *q++ = *p;
6295 if (*p == '\n' || *p == '\r')
6296 j = 0;
6297 }
6298
6299 return (PyObject*) u;
6300}
6301
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006302PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303"S.find(sub [,start [,end]]) -> int\n\
6304\n\
6305Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006306such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307arguments start and end are interpreted as in slice notation.\n\
6308\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006309Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006310
6311static PyObject *
6312unicode_find(PyUnicodeObject *self, PyObject *args)
6313{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006314 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006315 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006316 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006317 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318
Guido van Rossumb8872e62000-05-09 14:14:27 +00006319 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6320 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006322 substring = PyUnicode_FromObject(substring);
6323 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324 return NULL;
6325
Thomas Wouters477c8d52006-05-27 19:21:47 +00006326 result = stringlib_find_slice(
6327 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6328 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6329 start, end
6330 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331
6332 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006333
6334 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335}
6336
6337static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006338unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339{
6340 if (index < 0 || index >= self->length) {
6341 PyErr_SetString(PyExc_IndexError, "string index out of range");
6342 return NULL;
6343 }
6344
6345 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6346}
6347
6348static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006349unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006351 /* Since Unicode objects compare equal to their UTF-8 string
6352 counterparts, we hash the UTF-8 string. */
6353 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
6354 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006355}
6356
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006357PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358"S.index(sub [,start [,end]]) -> int\n\
6359\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006360Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006361
6362static PyObject *
6363unicode_index(PyUnicodeObject *self, PyObject *args)
6364{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006365 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006366 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006367 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006368 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369
Guido van Rossumb8872e62000-05-09 14:14:27 +00006370 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6371 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006373 substring = PyUnicode_FromObject(substring);
6374 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375 return NULL;
6376
Thomas Wouters477c8d52006-05-27 19:21:47 +00006377 result = stringlib_find_slice(
6378 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6379 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6380 start, end
6381 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006382
6383 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006384
Guido van Rossumd57fd912000-03-10 22:53:23 +00006385 if (result < 0) {
6386 PyErr_SetString(PyExc_ValueError, "substring not found");
6387 return NULL;
6388 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006389
Martin v. Löwis18e16552006-02-15 17:27:45 +00006390 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391}
6392
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006393PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006394"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006396Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006397at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398
6399static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006400unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401{
6402 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6403 register const Py_UNICODE *e;
6404 int cased;
6405
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406 /* Shortcut for single character strings */
6407 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006408 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006410 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006411 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006412 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006413
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414 e = p + PyUnicode_GET_SIZE(self);
6415 cased = 0;
6416 for (; p < e; p++) {
6417 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006418
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006420 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421 else if (!cased && Py_UNICODE_ISLOWER(ch))
6422 cased = 1;
6423 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006424 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425}
6426
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006427PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006428"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006430Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006431at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432
6433static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006434unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435{
6436 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6437 register const Py_UNICODE *e;
6438 int cased;
6439
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440 /* Shortcut for single character strings */
6441 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006442 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006443
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006444 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006445 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006446 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006447
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448 e = p + PyUnicode_GET_SIZE(self);
6449 cased = 0;
6450 for (; p < e; p++) {
6451 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006452
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006454 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455 else if (!cased && Py_UNICODE_ISUPPER(ch))
6456 cased = 1;
6457 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006458 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459}
6460
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006461PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006462"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006464Return True if S is a titlecased string and there is at least one\n\
6465character in S, i.e. upper- and titlecase characters may only\n\
6466follow uncased characters and lowercase characters only cased ones.\n\
6467Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468
6469static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006470unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471{
6472 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6473 register const Py_UNICODE *e;
6474 int cased, previous_is_cased;
6475
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476 /* Shortcut for single character strings */
6477 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006478 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6479 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006481 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006482 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006483 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006484
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485 e = p + PyUnicode_GET_SIZE(self);
6486 cased = 0;
6487 previous_is_cased = 0;
6488 for (; p < e; p++) {
6489 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006490
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6492 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006493 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494 previous_is_cased = 1;
6495 cased = 1;
6496 }
6497 else if (Py_UNICODE_ISLOWER(ch)) {
6498 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006499 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500 previous_is_cased = 1;
6501 cased = 1;
6502 }
6503 else
6504 previous_is_cased = 0;
6505 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006506 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507}
6508
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006509PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006510"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006512Return True if all characters in S are whitespace\n\
6513and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514
6515static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006516unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517{
6518 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6519 register const Py_UNICODE *e;
6520
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521 /* Shortcut for single character strings */
6522 if (PyUnicode_GET_SIZE(self) == 1 &&
6523 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006524 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006526 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006527 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006528 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006529
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 e = p + PyUnicode_GET_SIZE(self);
6531 for (; p < e; p++) {
6532 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006533 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006535 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006536}
6537
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006538PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006539"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006540\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006541Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006542and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006543
6544static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006545unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006546{
6547 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6548 register const Py_UNICODE *e;
6549
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006550 /* Shortcut for single character strings */
6551 if (PyUnicode_GET_SIZE(self) == 1 &&
6552 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006553 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006554
6555 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006556 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006557 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006558
6559 e = p + PyUnicode_GET_SIZE(self);
6560 for (; p < e; p++) {
6561 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006562 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006563 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006564 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006565}
6566
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006567PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006568"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006569\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006570Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006571and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006572
6573static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006574unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006575{
6576 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6577 register const Py_UNICODE *e;
6578
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006579 /* Shortcut for single character strings */
6580 if (PyUnicode_GET_SIZE(self) == 1 &&
6581 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006582 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006583
6584 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006585 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006586 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006587
6588 e = p + PyUnicode_GET_SIZE(self);
6589 for (; p < e; p++) {
6590 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006591 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006592 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006593 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006594}
6595
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006596PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006597"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006599Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006600False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601
6602static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006603unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604{
6605 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6606 register const Py_UNICODE *e;
6607
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608 /* Shortcut for single character strings */
6609 if (PyUnicode_GET_SIZE(self) == 1 &&
6610 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006611 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006613 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006614 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006615 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006616
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617 e = p + PyUnicode_GET_SIZE(self);
6618 for (; p < e; p++) {
6619 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006620 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006622 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623}
6624
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006625PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006626"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006628Return True if all characters in S are digits\n\
6629and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630
6631static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006632unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633{
6634 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6635 register const Py_UNICODE *e;
6636
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637 /* Shortcut for single character strings */
6638 if (PyUnicode_GET_SIZE(self) == 1 &&
6639 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006640 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006642 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006643 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006644 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006645
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646 e = p + PyUnicode_GET_SIZE(self);
6647 for (; p < e; p++) {
6648 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006649 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006651 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652}
6653
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006654PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006655"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006657Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006658False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659
6660static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006661unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662{
6663 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6664 register const Py_UNICODE *e;
6665
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666 /* Shortcut for single character strings */
6667 if (PyUnicode_GET_SIZE(self) == 1 &&
6668 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006669 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006671 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006672 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006673 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006674
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675 e = p + PyUnicode_GET_SIZE(self);
6676 for (; p < e; p++) {
6677 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006678 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006680 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681}
6682
Martin v. Löwis47383402007-08-15 07:32:56 +00006683int
6684PyUnicode_IsIdentifier(PyObject *self)
6685{
6686 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
6687 register const Py_UNICODE *e;
6688
6689 /* Special case for empty strings */
6690 if (PyUnicode_GET_SIZE(self) == 0)
6691 return 0;
6692
6693 /* PEP 3131 says that the first character must be in
6694 XID_Start and subsequent characters in XID_Continue,
6695 and for the ASCII range, the 2.x rules apply (i.e
6696 start with letters and underscore, continue with
6697 letters, digits, underscore). However, given the current
6698 definition of XID_Start and XID_Continue, it is sufficient
6699 to check just for these, except that _ must be allowed
6700 as starting an identifier. */
6701 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
6702 return 0;
6703
6704 e = p + PyUnicode_GET_SIZE(self);
6705 for (p++; p < e; p++) {
6706 if (!_PyUnicode_IsXidContinue(*p))
6707 return 0;
6708 }
6709 return 1;
6710}
6711
6712PyDoc_STRVAR(isidentifier__doc__,
6713"S.isidentifier() -> bool\n\
6714\n\
6715Return True if S is a valid identifier according\n\
6716to the language definition.");
6717
6718static PyObject*
6719unicode_isidentifier(PyObject *self)
6720{
6721 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
6722}
6723
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006724PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725"S.join(sequence) -> unicode\n\
6726\n\
6727Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006728sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729
6730static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006731unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006733 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734}
6735
Martin v. Löwis18e16552006-02-15 17:27:45 +00006736static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737unicode_length(PyUnicodeObject *self)
6738{
6739 return self->length;
6740}
6741
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006742PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006743"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744\n\
6745Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006746done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747
6748static PyObject *
6749unicode_ljust(PyUnicodeObject *self, PyObject *args)
6750{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006751 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006752 Py_UNICODE fillchar = ' ';
6753
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006754 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755 return NULL;
6756
Tim Peters7a29bd52001-09-12 03:03:31 +00006757 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758 Py_INCREF(self);
6759 return (PyObject*) self;
6760 }
6761
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006762 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763}
6764
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006765PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766"S.lower() -> unicode\n\
6767\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006768Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769
6770static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006771unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773 return fixup(self, fixlower);
6774}
6775
/* Strip modes.  Each value doubles as an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for a strip mode: the format string with its three
   character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
6784
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006785/* externally visible for str.strip(unicode) */
6786PyObject *
6787_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6788{
6789 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006790 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006791 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006792 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6793 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006794
Thomas Wouters477c8d52006-05-27 19:21:47 +00006795 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6796
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006797 i = 0;
6798 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006799 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6800 i++;
6801 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006802 }
6803
6804 j = len;
6805 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006806 do {
6807 j--;
6808 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6809 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006810 }
6811
6812 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006813 Py_INCREF(self);
6814 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006815 }
6816 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006817 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006818}
6819
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820
6821static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006822do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006824 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006825 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006826
6827 i = 0;
6828 if (striptype != RIGHTSTRIP) {
6829 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6830 i++;
6831 }
6832 }
6833
6834 j = len;
6835 if (striptype != LEFTSTRIP) {
6836 do {
6837 j--;
6838 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6839 j++;
6840 }
6841
6842 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6843 Py_INCREF(self);
6844 return (PyObject*)self;
6845 }
6846 else
6847 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848}
6849
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006850
6851static PyObject *
6852do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6853{
6854 PyObject *sep = NULL;
6855
6856 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6857 return NULL;
6858
6859 if (sep != NULL && sep != Py_None) {
6860 if (PyUnicode_Check(sep))
6861 return _PyUnicode_XStrip(self, striptype, sep);
6862 else if (PyString_Check(sep)) {
6863 PyObject *res;
6864 sep = PyUnicode_FromObject(sep);
6865 if (sep==NULL)
6866 return NULL;
6867 res = _PyUnicode_XStrip(self, striptype, sep);
6868 Py_DECREF(sep);
6869 return res;
6870 }
6871 else {
6872 PyErr_Format(PyExc_TypeError,
6873 "%s arg must be None, unicode or str",
6874 STRIPNAME(striptype));
6875 return NULL;
6876 }
6877 }
6878
6879 return do_strip(self, striptype);
6880}
6881
6882
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006883PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006884"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006885\n\
6886Return a copy of the string S with leading and trailing\n\
6887whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006888If chars is given and not None, remove characters in chars instead.\n\
6889If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006890
6891static PyObject *
6892unicode_strip(PyUnicodeObject *self, PyObject *args)
6893{
6894 if (PyTuple_GET_SIZE(args) == 0)
6895 return do_strip(self, BOTHSTRIP); /* Common case */
6896 else
6897 return do_argstrip(self, BOTHSTRIP, args);
6898}
6899
6900
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006901PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006902"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006903\n\
6904Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006905If chars is given and not None, remove characters in chars instead.\n\
6906If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006907
6908static PyObject *
6909unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6910{
6911 if (PyTuple_GET_SIZE(args) == 0)
6912 return do_strip(self, LEFTSTRIP); /* Common case */
6913 else
6914 return do_argstrip(self, LEFTSTRIP, args);
6915}
6916
6917
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006918PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006919"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006920\n\
6921Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006922If chars is given and not None, remove characters in chars instead.\n\
6923If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006924
6925static PyObject *
6926unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6927{
6928 if (PyTuple_GET_SIZE(args) == 0)
6929 return do_strip(self, RIGHTSTRIP); /* Common case */
6930 else
6931 return do_argstrip(self, RIGHTSTRIP, args);
6932}
6933
6934
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006936unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937{
6938 PyUnicodeObject *u;
6939 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006940 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006941 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942
6943 if (len < 0)
6944 len = 0;
6945
Tim Peters7a29bd52001-09-12 03:03:31 +00006946 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947 /* no repeat, return original string */
6948 Py_INCREF(str);
6949 return (PyObject*) str;
6950 }
Tim Peters8f422462000-09-09 06:13:41 +00006951
6952 /* ensure # of chars needed doesn't overflow int and # of bytes
6953 * needed doesn't overflow size_t
6954 */
6955 nchars = len * str->length;
6956 if (len && nchars / len != str->length) {
6957 PyErr_SetString(PyExc_OverflowError,
6958 "repeated string is too long");
6959 return NULL;
6960 }
6961 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6962 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6963 PyErr_SetString(PyExc_OverflowError,
6964 "repeated string is too long");
6965 return NULL;
6966 }
6967 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968 if (!u)
6969 return NULL;
6970
6971 p = u->str;
6972
Thomas Wouters477c8d52006-05-27 19:21:47 +00006973 if (str->length == 1 && len > 0) {
6974 Py_UNICODE_FILL(p, str->str[0], len);
6975 } else {
6976 Py_ssize_t done = 0; /* number of characters copied this far */
6977 if (done < nchars) {
6978 Py_UNICODE_COPY(p, str->str, str->length);
6979 done = str->length;
6980 }
6981 while (done < nchars) {
6982 int n = (done <= nchars-done) ? done : nchars-done;
6983 Py_UNICODE_COPY(p+done, p, n);
6984 done += n;
6985 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986 }
6987
6988 return (PyObject*) u;
6989}
6990
6991PyObject *PyUnicode_Replace(PyObject *obj,
6992 PyObject *subobj,
6993 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006994 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995{
6996 PyObject *self;
6997 PyObject *str1;
6998 PyObject *str2;
6999 PyObject *result;
7000
7001 self = PyUnicode_FromObject(obj);
7002 if (self == NULL)
7003 return NULL;
7004 str1 = PyUnicode_FromObject(subobj);
7005 if (str1 == NULL) {
7006 Py_DECREF(self);
7007 return NULL;
7008 }
7009 str2 = PyUnicode_FromObject(replobj);
7010 if (str2 == NULL) {
7011 Py_DECREF(self);
7012 Py_DECREF(str1);
7013 return NULL;
7014 }
Tim Petersced69f82003-09-16 20:30:58 +00007015 result = replace((PyUnicodeObject *)self,
7016 (PyUnicodeObject *)str1,
7017 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018 maxcount);
7019 Py_DECREF(self);
7020 Py_DECREF(str1);
7021 Py_DECREF(str2);
7022 return result;
7023}
7024
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007025PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026"S.replace (old, new[, maxsplit]) -> unicode\n\
7027\n\
7028Return a copy of S with all occurrences of substring\n\
7029old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007030given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031
7032static PyObject*
7033unicode_replace(PyUnicodeObject *self, PyObject *args)
7034{
7035 PyUnicodeObject *str1;
7036 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007037 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038 PyObject *result;
7039
Martin v. Löwis18e16552006-02-15 17:27:45 +00007040 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041 return NULL;
7042 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7043 if (str1 == NULL)
7044 return NULL;
7045 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007046 if (str2 == NULL) {
7047 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007049 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050
7051 result = replace(self, str1, str2, maxcount);
7052
7053 Py_DECREF(str1);
7054 Py_DECREF(str2);
7055 return result;
7056}
7057
7058static
7059PyObject *unicode_repr(PyObject *unicode)
7060{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007061 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007062 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007063 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7064 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7065
7066 /* XXX(nnorwitz): rather than over-allocating, it would be
7067 better to choose a different scheme. Perhaps scan the
7068 first N-chars of the string and allocate based on that size.
7069 */
7070 /* Initial allocation is based on the longest-possible unichr
7071 escape.
7072
7073 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7074 unichr, so in this case it's the longest unichr escape. In
7075 narrow (UTF-16) builds this is five chars per source unichr
7076 since there are two unichrs in the surrogate pair, so in narrow
7077 (UTF-16) builds it's not the longest unichr escape.
7078
7079 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7080 so in the narrow (UTF-16) build case it's the longest unichr
7081 escape.
7082 */
7083
Walter Dörwald1ab83302007-05-18 17:15:44 +00007084 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007085 2 /* quotes */
7086#ifdef Py_UNICODE_WIDE
7087 + 10*size
7088#else
7089 + 6*size
7090#endif
7091 + 1);
7092 if (repr == NULL)
7093 return NULL;
7094
Walter Dörwald1ab83302007-05-18 17:15:44 +00007095 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007096
7097 /* Add quote */
7098 *p++ = (findchar(s, size, '\'') &&
7099 !findchar(s, size, '"')) ? '"' : '\'';
7100 while (size-- > 0) {
7101 Py_UNICODE ch = *s++;
7102
7103 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007104 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007105 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007106 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007107 continue;
7108 }
7109
7110#ifdef Py_UNICODE_WIDE
7111 /* Map 21-bit characters to '\U00xxxxxx' */
7112 else if (ch >= 0x10000) {
7113 *p++ = '\\';
7114 *p++ = 'U';
7115 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7116 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7117 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7118 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7119 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7120 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7121 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7122 *p++ = hexdigits[ch & 0x0000000F];
7123 continue;
7124 }
7125#else
7126 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7127 else if (ch >= 0xD800 && ch < 0xDC00) {
7128 Py_UNICODE ch2;
7129 Py_UCS4 ucs;
7130
7131 ch2 = *s++;
7132 size--;
7133 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7134 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7135 *p++ = '\\';
7136 *p++ = 'U';
7137 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7138 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7139 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7140 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7141 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7142 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7143 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7144 *p++ = hexdigits[ucs & 0x0000000F];
7145 continue;
7146 }
7147 /* Fall through: isolated surrogates are copied as-is */
7148 s--;
7149 size++;
7150 }
7151#endif
7152
7153 /* Map 16-bit characters to '\uxxxx' */
7154 if (ch >= 256) {
7155 *p++ = '\\';
7156 *p++ = 'u';
7157 *p++ = hexdigits[(ch >> 12) & 0x000F];
7158 *p++ = hexdigits[(ch >> 8) & 0x000F];
7159 *p++ = hexdigits[(ch >> 4) & 0x000F];
7160 *p++ = hexdigits[ch & 0x000F];
7161 }
7162
7163 /* Map special whitespace to '\t', \n', '\r' */
7164 else if (ch == '\t') {
7165 *p++ = '\\';
7166 *p++ = 't';
7167 }
7168 else if (ch == '\n') {
7169 *p++ = '\\';
7170 *p++ = 'n';
7171 }
7172 else if (ch == '\r') {
7173 *p++ = '\\';
7174 *p++ = 'r';
7175 }
7176
7177 /* Map non-printable US ASCII to '\xhh' */
7178 else if (ch < ' ' || ch >= 0x7F) {
7179 *p++ = '\\';
7180 *p++ = 'x';
7181 *p++ = hexdigits[(ch >> 4) & 0x000F];
7182 *p++ = hexdigits[ch & 0x000F];
7183 }
7184
7185 /* Copy everything else as-is */
7186 else
7187 *p++ = (char) ch;
7188 }
7189 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007190 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007191
7192 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007193 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007194 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195}
7196
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007197PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007198"S.rfind(sub [,start [,end]]) -> int\n\
7199\n\
7200Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007201such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007202arguments start and end are interpreted as in slice notation.\n\
7203\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007204Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205
7206static PyObject *
7207unicode_rfind(PyUnicodeObject *self, PyObject *args)
7208{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007209 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007210 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007211 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007212 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213
Guido van Rossumb8872e62000-05-09 14:14:27 +00007214 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7215 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007216 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007217 substring = PyUnicode_FromObject(substring);
7218 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007219 return NULL;
7220
Thomas Wouters477c8d52006-05-27 19:21:47 +00007221 result = stringlib_rfind_slice(
7222 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7223 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7224 start, end
7225 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007226
7227 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007228
7229 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230}
7231
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007232PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233"S.rindex(sub [,start [,end]]) -> int\n\
7234\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007235Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007236
7237static PyObject *
7238unicode_rindex(PyUnicodeObject *self, PyObject *args)
7239{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007240 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007241 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007242 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007243 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007244
Guido van Rossumb8872e62000-05-09 14:14:27 +00007245 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7246 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007247 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007248 substring = PyUnicode_FromObject(substring);
7249 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250 return NULL;
7251
Thomas Wouters477c8d52006-05-27 19:21:47 +00007252 result = stringlib_rfind_slice(
7253 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7254 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7255 start, end
7256 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007257
7258 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007259
Guido van Rossumd57fd912000-03-10 22:53:23 +00007260 if (result < 0) {
7261 PyErr_SetString(PyExc_ValueError, "substring not found");
7262 return NULL;
7263 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007264 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007265}
7266
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007267PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007268"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007269\n\
7270Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007271done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007272
7273static PyObject *
7274unicode_rjust(PyUnicodeObject *self, PyObject *args)
7275{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007276 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007277 Py_UNICODE fillchar = ' ';
7278
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007279 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007280 return NULL;
7281
Tim Peters7a29bd52001-09-12 03:03:31 +00007282 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283 Py_INCREF(self);
7284 return (PyObject*) self;
7285 }
7286
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007287 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007288}
7289
Guido van Rossumd57fd912000-03-10 22:53:23 +00007290static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007291unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292{
7293 /* standard clamping */
7294 if (start < 0)
7295 start = 0;
7296 if (end < 0)
7297 end = 0;
7298 if (end > self->length)
7299 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007300 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007301 /* full slice, return original string */
7302 Py_INCREF(self);
7303 return (PyObject*) self;
7304 }
7305 if (start > end)
7306 start = end;
7307 /* copy slice */
7308 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7309 end - start);
7310}
7311
7312PyObject *PyUnicode_Split(PyObject *s,
7313 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007314 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315{
7316 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007317
Guido van Rossumd57fd912000-03-10 22:53:23 +00007318 s = PyUnicode_FromObject(s);
7319 if (s == NULL)
7320 return NULL;
7321 if (sep != NULL) {
7322 sep = PyUnicode_FromObject(sep);
7323 if (sep == NULL) {
7324 Py_DECREF(s);
7325 return NULL;
7326 }
7327 }
7328
7329 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7330
7331 Py_DECREF(s);
7332 Py_XDECREF(sep);
7333 return result;
7334}
7335
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007336PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007337"S.split([sep [,maxsplit]]) -> list of strings\n\
7338\n\
7339Return a list of the words in S, using sep as the\n\
7340delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007341splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007342any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343
7344static PyObject*
7345unicode_split(PyUnicodeObject *self, PyObject *args)
7346{
7347 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007348 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349
Martin v. Löwis18e16552006-02-15 17:27:45 +00007350 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007351 return NULL;
7352
7353 if (substring == Py_None)
7354 return split(self, NULL, maxcount);
7355 else if (PyUnicode_Check(substring))
7356 return split(self, (PyUnicodeObject *)substring, maxcount);
7357 else
7358 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7359}
7360
Thomas Wouters477c8d52006-05-27 19:21:47 +00007361PyObject *
7362PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7363{
7364 PyObject* str_obj;
7365 PyObject* sep_obj;
7366 PyObject* out;
7367
7368 str_obj = PyUnicode_FromObject(str_in);
7369 if (!str_obj)
7370 return NULL;
7371 sep_obj = PyUnicode_FromObject(sep_in);
7372 if (!sep_obj) {
7373 Py_DECREF(str_obj);
7374 return NULL;
7375 }
7376
7377 out = stringlib_partition(
7378 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7379 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7380 );
7381
7382 Py_DECREF(sep_obj);
7383 Py_DECREF(str_obj);
7384
7385 return out;
7386}
7387
7388
7389PyObject *
7390PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7391{
7392 PyObject* str_obj;
7393 PyObject* sep_obj;
7394 PyObject* out;
7395
7396 str_obj = PyUnicode_FromObject(str_in);
7397 if (!str_obj)
7398 return NULL;
7399 sep_obj = PyUnicode_FromObject(sep_in);
7400 if (!sep_obj) {
7401 Py_DECREF(str_obj);
7402 return NULL;
7403 }
7404
7405 out = stringlib_rpartition(
7406 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7407 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7408 );
7409
7410 Py_DECREF(sep_obj);
7411 Py_DECREF(str_obj);
7412
7413 return out;
7414}
7415
7416PyDoc_STRVAR(partition__doc__,
7417"S.partition(sep) -> (head, sep, tail)\n\
7418\n\
7419Searches for the separator sep in S, and returns the part before it,\n\
7420the separator itself, and the part after it. If the separator is not\n\
7421found, returns S and two empty strings.");
7422
7423static PyObject*
7424unicode_partition(PyUnicodeObject *self, PyObject *separator)
7425{
7426 return PyUnicode_Partition((PyObject *)self, separator);
7427}
7428
7429PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007430"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007431\n\
7432Searches for the separator sep in S, starting at the end of S, and returns\n\
7433the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007434separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007435
7436static PyObject*
7437unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7438{
7439 return PyUnicode_RPartition((PyObject *)self, separator);
7440}
7441
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007442PyObject *PyUnicode_RSplit(PyObject *s,
7443 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007444 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007445{
7446 PyObject *result;
7447
7448 s = PyUnicode_FromObject(s);
7449 if (s == NULL)
7450 return NULL;
7451 if (sep != NULL) {
7452 sep = PyUnicode_FromObject(sep);
7453 if (sep == NULL) {
7454 Py_DECREF(s);
7455 return NULL;
7456 }
7457 }
7458
7459 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7460
7461 Py_DECREF(s);
7462 Py_XDECREF(sep);
7463 return result;
7464}
7465
7466PyDoc_STRVAR(rsplit__doc__,
7467"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7468\n\
7469Return a list of the words in S, using sep as the\n\
7470delimiter string, starting at the end of the string and\n\
7471working to the front. If maxsplit is given, at most maxsplit\n\
7472splits are done. If sep is not specified, any whitespace string\n\
7473is a separator.");
7474
7475static PyObject*
7476unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7477{
7478 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007479 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007480
Martin v. Löwis18e16552006-02-15 17:27:45 +00007481 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007482 return NULL;
7483
7484 if (substring == Py_None)
7485 return rsplit(self, NULL, maxcount);
7486 else if (PyUnicode_Check(substring))
7487 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7488 else
7489 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7490}
7491
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007492PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007493"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007494\n\
7495Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007496Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007497is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498
7499static PyObject*
7500unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7501{
Guido van Rossum86662912000-04-11 15:38:46 +00007502 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007503
Guido van Rossum86662912000-04-11 15:38:46 +00007504 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505 return NULL;
7506
Guido van Rossum86662912000-04-11 15:38:46 +00007507 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508}
7509
7510static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007511PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007512{
Walter Dörwald346737f2007-05-31 10:44:43 +00007513 if (PyUnicode_CheckExact(self)) {
7514 Py_INCREF(self);
7515 return self;
7516 } else
7517 /* Subtype -- return genuine unicode string with the same value. */
7518 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7519 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007520}
7521
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007522PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007523"S.swapcase() -> unicode\n\
7524\n\
7525Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007526and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007527
7528static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007529unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007530{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007531 return fixup(self, fixswapcase);
7532}
7533
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007534PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007535"S.translate(table) -> unicode\n\
7536\n\
7537Return a copy of the string S, where all characters have been mapped\n\
7538through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007539Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7540Unmapped characters are left untouched. Characters mapped to None\n\
7541are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007542
7543static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007544unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007545{
Tim Petersced69f82003-09-16 20:30:58 +00007546 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007547 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007548 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549 "ignore");
7550}
7551
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007552PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007553"S.upper() -> unicode\n\
7554\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007555Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556
7557static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007558unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007559{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007560 return fixup(self, fixupper);
7561}
7562
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007563PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564"S.zfill(width) -> unicode\n\
7565\n\
7566Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007567of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568
7569static PyObject *
7570unicode_zfill(PyUnicodeObject *self, PyObject *args)
7571{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007572 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573 PyUnicodeObject *u;
7574
Martin v. Löwis18e16552006-02-15 17:27:45 +00007575 Py_ssize_t width;
7576 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577 return NULL;
7578
7579 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007580 if (PyUnicode_CheckExact(self)) {
7581 Py_INCREF(self);
7582 return (PyObject*) self;
7583 }
7584 else
7585 return PyUnicode_FromUnicode(
7586 PyUnicode_AS_UNICODE(self),
7587 PyUnicode_GET_SIZE(self)
7588 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007589 }
7590
7591 fill = width - self->length;
7592
7593 u = pad(self, fill, 0, '0');
7594
Walter Dörwald068325e2002-04-15 13:36:47 +00007595 if (u == NULL)
7596 return NULL;
7597
Guido van Rossumd57fd912000-03-10 22:53:23 +00007598 if (u->str[fill] == '+' || u->str[fill] == '-') {
7599 /* move sign to beginning of string */
7600 u->str[0] = u->str[fill];
7601 u->str[fill] = '0';
7602 }
7603
7604 return (PyObject*) u;
7605}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007606
#if 0
/* Compiled out: returns the current value of unicode_freelist_size as an
   int object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long current = unicode_freelist_size;

    return PyInt_FromLong(current);
}
#endif
7614
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007615PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007616"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007617\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007618Return True if S starts with the specified prefix, False otherwise.\n\
7619With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007620With optional end, stop comparing S at that position.\n\
7621prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007622
7623static PyObject *
7624unicode_startswith(PyUnicodeObject *self,
7625 PyObject *args)
7626{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007627 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007628 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007629 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007630 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007631 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007632
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007633 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007634 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007635 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007636 if (PyTuple_Check(subobj)) {
7637 Py_ssize_t i;
7638 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7639 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7640 PyTuple_GET_ITEM(subobj, i));
7641 if (substring == NULL)
7642 return NULL;
7643 result = tailmatch(self, substring, start, end, -1);
7644 Py_DECREF(substring);
7645 if (result) {
7646 Py_RETURN_TRUE;
7647 }
7648 }
7649 /* nothing matched */
7650 Py_RETURN_FALSE;
7651 }
7652 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007653 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007654 return NULL;
7655 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007656 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007657 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007658}
7659
7660
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007661PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007662"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007663\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007664Return True if S ends with the specified suffix, False otherwise.\n\
7665With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007666With optional end, stop comparing S at that position.\n\
7667suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007668
7669static PyObject *
7670unicode_endswith(PyUnicodeObject *self,
7671 PyObject *args)
7672{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007673 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007675 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007676 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007677 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007678
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007679 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7680 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007681 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007682 if (PyTuple_Check(subobj)) {
7683 Py_ssize_t i;
7684 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7685 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7686 PyTuple_GET_ITEM(subobj, i));
7687 if (substring == NULL)
7688 return NULL;
7689 result = tailmatch(self, substring, start, end, +1);
7690 Py_DECREF(substring);
7691 if (result) {
7692 Py_RETURN_TRUE;
7693 }
7694 }
7695 Py_RETURN_FALSE;
7696 }
7697 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007698 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007699 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007700
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007701 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007702 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007703 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007704}
7705
7706
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007707
7708static PyObject *
7709unicode_getnewargs(PyUnicodeObject *v)
7710{
7711 return Py_BuildValue("(u#)", v->str, v->length);
7712}
7713
7714
Guido van Rossumd57fd912000-03-10 22:53:23 +00007715static PyMethodDef unicode_methods[] = {
7716
7717 /* Order is according to common usage: often used methods should
7718 appear first, since lookup is done sequentially. */
7719
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007720 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7721 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7722 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007723 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007724 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7725 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7726 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7727 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7728 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7729 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7730 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007731 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007732 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7733 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7734 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007735 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007736 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007737/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7738 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7739 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7740 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007741 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007742 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007743 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007744 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007745 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7746 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7747 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7748 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7749 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7750 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7751 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7752 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7753 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7754 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7755 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7756 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7757 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7758 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00007759 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007760 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007761#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007762 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763#endif
7764
7765#if 0
7766 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007767 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768#endif
7769
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007770 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771 {NULL, NULL}
7772};
7773
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007774static PyObject *
7775unicode_mod(PyObject *v, PyObject *w)
7776{
7777 if (!PyUnicode_Check(v)) {
7778 Py_INCREF(Py_NotImplemented);
7779 return Py_NotImplemented;
7780 }
7781 return PyUnicode_Format(v, w);
7782}
7783
7784static PyNumberMethods unicode_as_number = {
7785 0, /*nb_add*/
7786 0, /*nb_subtract*/
7787 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007788 unicode_mod, /*nb_remainder*/
7789};
7790
Guido van Rossumd57fd912000-03-10 22:53:23 +00007791static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007792 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007793 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007794 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7795 (ssizeargfunc) unicode_getitem, /* sq_item */
7796 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007797 0, /* sq_ass_item */
7798 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007799 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007800};
7801
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007802static PyObject*
7803unicode_subscript(PyUnicodeObject* self, PyObject* item)
7804{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007805 if (PyIndex_Check(item)) {
7806 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007807 if (i == -1 && PyErr_Occurred())
7808 return NULL;
7809 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007810 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007811 return unicode_getitem(self, i);
7812 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007813 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007814 Py_UNICODE* source_buf;
7815 Py_UNICODE* result_buf;
7816 PyObject* result;
7817
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007818 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007819 &start, &stop, &step, &slicelength) < 0) {
7820 return NULL;
7821 }
7822
7823 if (slicelength <= 0) {
7824 return PyUnicode_FromUnicode(NULL, 0);
7825 } else {
7826 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007827 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7828 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007829
7830 if (result_buf == NULL)
7831 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007832
7833 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7834 result_buf[i] = source_buf[cur];
7835 }
Tim Petersced69f82003-09-16 20:30:58 +00007836
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007837 result = PyUnicode_FromUnicode(result_buf, slicelength);
7838 PyMem_FREE(result_buf);
7839 return result;
7840 }
7841 } else {
7842 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7843 return NULL;
7844 }
7845}
7846
7847static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007848 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007849 (binaryfunc)unicode_subscript, /* mp_subscript */
7850 (objobjargproc)0, /* mp_ass_subscript */
7851};
7852
Martin v. Löwis18e16552006-02-15 17:27:45 +00007853static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007854unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007855 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856 const void **ptr)
7857{
7858 if (index != 0) {
7859 PyErr_SetString(PyExc_SystemError,
7860 "accessing non-existent unicode segment");
7861 return -1;
7862 }
7863 *ptr = (void *) self->str;
7864 return PyUnicode_GET_DATA_SIZE(self);
7865}
7866
Martin v. Löwis18e16552006-02-15 17:27:45 +00007867static Py_ssize_t
7868unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007869 const void **ptr)
7870{
7871 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007872 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007873 return -1;
7874}
7875
7876static int
7877unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007878 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007879{
7880 if (lenp)
7881 *lenp = PyUnicode_GET_DATA_SIZE(self);
7882 return 1;
7883}
7884
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007885static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007886unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007887 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007888 const void **ptr)
7889{
7890 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007891
Guido van Rossumd57fd912000-03-10 22:53:23 +00007892 if (index != 0) {
7893 PyErr_SetString(PyExc_SystemError,
7894 "accessing non-existent unicode segment");
7895 return -1;
7896 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007897 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007898 if (str == NULL)
7899 return -1;
7900 *ptr = (void *) PyString_AS_STRING(str);
7901 return PyString_GET_SIZE(str);
7902}
7903
7904/* Helpers for PyUnicode_Format() */
7905
7906static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007907getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007908{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007909 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007910 if (argidx < arglen) {
7911 (*p_argidx)++;
7912 if (arglen < 0)
7913 return args;
7914 else
7915 return PyTuple_GetItem(args, argidx);
7916 }
7917 PyErr_SetString(PyExc_TypeError,
7918 "not enough arguments for format string");
7919 return NULL;
7920}
7921
/* Bit flags recording which conversion-flag characters were seen in a
   format specifier. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
7927
Martin v. Löwis18e16552006-02-15 17:27:45 +00007928static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007929strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007930{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007931 register Py_ssize_t i;
7932 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007933 for (i = len - 1; i >= 0; i--)
7934 buffer[i] = (Py_UNICODE) charbuffer[i];
7935
Guido van Rossumd57fd912000-03-10 22:53:23 +00007936 return len;
7937}
7938
Neal Norwitzfc76d632006-01-10 06:03:13 +00007939static int
7940doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7941{
Tim Peters15231542006-02-16 01:08:01 +00007942 Py_ssize_t result;
7943
Neal Norwitzfc76d632006-01-10 06:03:13 +00007944 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007945 result = strtounicode(buffer, (char *)buffer);
7946 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007947}
7948
7949static int
7950longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7951{
Tim Peters15231542006-02-16 01:08:01 +00007952 Py_ssize_t result;
7953
Neal Norwitzfc76d632006-01-10 06:03:13 +00007954 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007955 result = strtounicode(buffer, (char *)buffer);
7956 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007957}
7958
Guido van Rossum078151d2002-08-11 04:24:12 +00007959/* XXX To save some code duplication, formatfloat/long/int could have been
7960 shared with stringobject.c, converting from 8-bit to Unicode after the
7961 formatting is done. */
7962
Guido van Rossumd57fd912000-03-10 22:53:23 +00007963static int
7964formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007965 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007966 int flags,
7967 int prec,
7968 int type,
7969 PyObject *v)
7970{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007971 /* fmt = '%#.' + `prec` + `type`
7972 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007973 char fmt[20];
7974 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007975
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976 x = PyFloat_AsDouble(v);
7977 if (x == -1.0 && PyErr_Occurred())
7978 return -1;
7979 if (prec < 0)
7980 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7982 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007983 /* Worst case length calc to ensure no buffer overrun:
7984
7985 'g' formats:
7986 fmt = %#.<prec>g
7987 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7988 for any double rep.)
7989 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7990
7991 'f' formats:
7992 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7993 len = 1 + 50 + 1 + prec = 52 + prec
7994
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007995 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007996 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007997
7998 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00007999 if (((type == 'g' || type == 'G') &&
8000 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008001 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008002 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008003 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008004 return -1;
8005 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008006 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8007 (flags&F_ALT) ? "#" : "",
8008 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008009 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010}
8011
Tim Peters38fd5b62000-09-21 05:43:11 +00008012static PyObject*
8013formatlong(PyObject *val, int flags, int prec, int type)
8014{
8015 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008016 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008017 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008018 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008019
8020 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8021 if (!str)
8022 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008023 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008024 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008025 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008026}
8027
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028static int
8029formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008030 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031 int flags,
8032 int prec,
8033 int type,
8034 PyObject *v)
8035{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008036 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008037 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8038 * + 1 + 1
8039 * = 24
8040 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008041 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008042 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043 long x;
8044
8045 x = PyInt_AsLong(v);
8046 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008047 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008048 if (x < 0 && type == 'u') {
8049 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008050 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008051 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8052 sign = "-";
8053 else
8054 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008056 prec = 1;
8057
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008058 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8059 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008060 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008061 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008062 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008063 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008064 return -1;
8065 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008066
8067 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008068 (type == 'x' || type == 'X' || type == 'o')) {
8069 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008070 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008071 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008072 * - when 0 is being converted, the C standard leaves off
8073 * the '0x' or '0X', which is inconsistent with other
8074 * %#x/%#X conversions and inconsistent with Python's
8075 * hex() function
8076 * - there are platforms that violate the standard and
8077 * convert 0 with the '0x' or '0X'
8078 * (Metrowerks, Compaq Tru64)
8079 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008080 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008081 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008082 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008083 * We can achieve the desired consistency by inserting our
8084 * own '0x' or '0X' prefix, and substituting %x/%X in place
8085 * of %#x/%#X.
8086 *
8087 * Note that this is the same approach as used in
8088 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008089 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008090 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8091 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008092 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008093 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008094 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8095 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008096 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008097 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008098 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008099 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008100 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008101 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102}
8103
8104static int
8105formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008106 size_t buflen,
8107 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008108{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008109 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008110 if (PyUnicode_Check(v)) {
8111 if (PyUnicode_GET_SIZE(v) != 1)
8112 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008113 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008114 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008115
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008116 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008117 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008118 goto onError;
8119 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8120 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008121
8122 else {
8123 /* Integer input truncated to a character */
8124 long x;
8125 x = PyInt_AsLong(v);
8126 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008127 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008128#ifdef Py_UNICODE_WIDE
8129 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008130 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008131 "%c arg not in range(0x110000) "
8132 "(wide Python build)");
8133 return -1;
8134 }
8135#else
8136 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008137 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008138 "%c arg not in range(0x10000) "
8139 "(narrow Python build)");
8140 return -1;
8141 }
8142#endif
8143 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008144 }
8145 buf[1] = '\0';
8146 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008147
8148 onError:
8149 PyErr_SetString(PyExc_TypeError,
8150 "%c requires int or char");
8151 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008152}
8153
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.
*/
/* The expansion is fully parenthesised so that it behaves as a single
   operand wherever the macro is used. */
#define FORMATBUFLEN ((size_t)120)
8163
Guido van Rossumd57fd912000-03-10 22:53:23 +00008164PyObject *PyUnicode_Format(PyObject *format,
8165 PyObject *args)
8166{
8167 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008168 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008169 int args_owned = 0;
8170 PyUnicodeObject *result = NULL;
8171 PyObject *dict = NULL;
8172 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008173
Guido van Rossumd57fd912000-03-10 22:53:23 +00008174 if (format == NULL || args == NULL) {
8175 PyErr_BadInternalCall();
8176 return NULL;
8177 }
8178 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008179 if (uformat == NULL)
8180 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008181 fmt = PyUnicode_AS_UNICODE(uformat);
8182 fmtcnt = PyUnicode_GET_SIZE(uformat);
8183
8184 reslen = rescnt = fmtcnt + 100;
8185 result = _PyUnicode_New(reslen);
8186 if (result == NULL)
8187 goto onError;
8188 res = PyUnicode_AS_UNICODE(result);
8189
8190 if (PyTuple_Check(args)) {
8191 arglen = PyTuple_Size(args);
8192 argidx = 0;
8193 }
8194 else {
8195 arglen = -1;
8196 argidx = -2;
8197 }
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008198 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008199 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008200 dict = args;
8201
8202 while (--fmtcnt >= 0) {
8203 if (*fmt != '%') {
8204 if (--rescnt < 0) {
8205 rescnt = fmtcnt + 100;
8206 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008207 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008208 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008209 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8210 --rescnt;
8211 }
8212 *res++ = *fmt++;
8213 }
8214 else {
8215 /* Got a format specifier */
8216 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008217 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008218 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008219 Py_UNICODE c = '\0';
8220 Py_UNICODE fill;
8221 PyObject *v = NULL;
8222 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008223 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008225 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008226 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008227
8228 fmt++;
8229 if (*fmt == '(') {
8230 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008231 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008232 PyObject *key;
8233 int pcount = 1;
8234
8235 if (dict == NULL) {
8236 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008237 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008238 goto onError;
8239 }
8240 ++fmt;
8241 --fmtcnt;
8242 keystart = fmt;
8243 /* Skip over balanced parentheses */
8244 while (pcount > 0 && --fmtcnt >= 0) {
8245 if (*fmt == ')')
8246 --pcount;
8247 else if (*fmt == '(')
8248 ++pcount;
8249 fmt++;
8250 }
8251 keylen = fmt - keystart - 1;
8252 if (fmtcnt < 0 || pcount > 0) {
8253 PyErr_SetString(PyExc_ValueError,
8254 "incomplete format key");
8255 goto onError;
8256 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008257#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008258 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008259 then looked up since Python uses strings to hold
8260 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008261 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008262 key = PyUnicode_EncodeUTF8(keystart,
8263 keylen,
8264 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008265#else
8266 key = PyUnicode_FromUnicode(keystart, keylen);
8267#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008268 if (key == NULL)
8269 goto onError;
8270 if (args_owned) {
8271 Py_DECREF(args);
8272 args_owned = 0;
8273 }
8274 args = PyObject_GetItem(dict, key);
8275 Py_DECREF(key);
8276 if (args == NULL) {
8277 goto onError;
8278 }
8279 args_owned = 1;
8280 arglen = -1;
8281 argidx = -2;
8282 }
8283 while (--fmtcnt >= 0) {
8284 switch (c = *fmt++) {
8285 case '-': flags |= F_LJUST; continue;
8286 case '+': flags |= F_SIGN; continue;
8287 case ' ': flags |= F_BLANK; continue;
8288 case '#': flags |= F_ALT; continue;
8289 case '0': flags |= F_ZERO; continue;
8290 }
8291 break;
8292 }
8293 if (c == '*') {
8294 v = getnextarg(args, arglen, &argidx);
8295 if (v == NULL)
8296 goto onError;
8297 if (!PyInt_Check(v)) {
8298 PyErr_SetString(PyExc_TypeError,
8299 "* wants int");
8300 goto onError;
8301 }
8302 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008303 if (width == -1 && PyErr_Occurred())
8304 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305 if (width < 0) {
8306 flags |= F_LJUST;
8307 width = -width;
8308 }
8309 if (--fmtcnt >= 0)
8310 c = *fmt++;
8311 }
8312 else if (c >= '0' && c <= '9') {
8313 width = c - '0';
8314 while (--fmtcnt >= 0) {
8315 c = *fmt++;
8316 if (c < '0' || c > '9')
8317 break;
8318 if ((width*10) / 10 != width) {
8319 PyErr_SetString(PyExc_ValueError,
8320 "width too big");
8321 goto onError;
8322 }
8323 width = width*10 + (c - '0');
8324 }
8325 }
8326 if (c == '.') {
8327 prec = 0;
8328 if (--fmtcnt >= 0)
8329 c = *fmt++;
8330 if (c == '*') {
8331 v = getnextarg(args, arglen, &argidx);
8332 if (v == NULL)
8333 goto onError;
8334 if (!PyInt_Check(v)) {
8335 PyErr_SetString(PyExc_TypeError,
8336 "* wants int");
8337 goto onError;
8338 }
8339 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008340 if (prec == -1 && PyErr_Occurred())
8341 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008342 if (prec < 0)
8343 prec = 0;
8344 if (--fmtcnt >= 0)
8345 c = *fmt++;
8346 }
8347 else if (c >= '0' && c <= '9') {
8348 prec = c - '0';
8349 while (--fmtcnt >= 0) {
8350 c = Py_CHARMASK(*fmt++);
8351 if (c < '0' || c > '9')
8352 break;
8353 if ((prec*10) / 10 != prec) {
8354 PyErr_SetString(PyExc_ValueError,
8355 "prec too big");
8356 goto onError;
8357 }
8358 prec = prec*10 + (c - '0');
8359 }
8360 }
8361 } /* prec */
8362 if (fmtcnt >= 0) {
8363 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008364 if (--fmtcnt >= 0)
8365 c = *fmt++;
8366 }
8367 }
8368 if (fmtcnt < 0) {
8369 PyErr_SetString(PyExc_ValueError,
8370 "incomplete format");
8371 goto onError;
8372 }
8373 if (c != '%') {
8374 v = getnextarg(args, arglen, &argidx);
8375 if (v == NULL)
8376 goto onError;
8377 }
8378 sign = 0;
8379 fill = ' ';
8380 switch (c) {
8381
8382 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008383 pbuf = formatbuf;
8384 /* presume that buffer length is at least 1 */
8385 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008386 len = 1;
8387 break;
8388
8389 case 's':
8390 case 'r':
8391 if (PyUnicode_Check(v) && c == 's') {
8392 temp = v;
8393 Py_INCREF(temp);
8394 }
8395 else {
8396 PyObject *unicode;
8397 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008398 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008399 else
8400 temp = PyObject_Repr(v);
8401 if (temp == NULL)
8402 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008403 if (PyUnicode_Check(temp))
8404 /* nothing to do */;
8405 else if (PyString_Check(temp)) {
8406 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008407 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008408 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008409 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008410 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008411 Py_DECREF(temp);
8412 temp = unicode;
8413 if (temp == NULL)
8414 goto onError;
8415 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008416 else {
8417 Py_DECREF(temp);
8418 PyErr_SetString(PyExc_TypeError,
8419 "%s argument has non-string str()");
8420 goto onError;
8421 }
8422 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008423 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008424 len = PyUnicode_GET_SIZE(temp);
8425 if (prec >= 0 && len > prec)
8426 len = prec;
8427 break;
8428
8429 case 'i':
8430 case 'd':
8431 case 'u':
8432 case 'o':
8433 case 'x':
8434 case 'X':
8435 if (c == 'i')
8436 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008437 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008438 temp = formatlong(v, flags, prec, c);
8439 if (!temp)
8440 goto onError;
8441 pbuf = PyUnicode_AS_UNICODE(temp);
8442 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008443 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008444 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008445 else {
8446 pbuf = formatbuf;
8447 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8448 flags, prec, c, v);
8449 if (len < 0)
8450 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008451 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008452 }
8453 if (flags & F_ZERO)
8454 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008455 break;
8456
8457 case 'e':
8458 case 'E':
8459 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008460 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008461 case 'g':
8462 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008463 if (c == 'F')
8464 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008465 pbuf = formatbuf;
8466 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8467 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008468 if (len < 0)
8469 goto onError;
8470 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008471 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008472 fill = '0';
8473 break;
8474
8475 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008476 pbuf = formatbuf;
8477 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008478 if (len < 0)
8479 goto onError;
8480 break;
8481
8482 default:
8483 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008484 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008485 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008486 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008487 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008488 (Py_ssize_t)(fmt - 1 -
8489 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008490 goto onError;
8491 }
8492 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008493 if (*pbuf == '-' || *pbuf == '+') {
8494 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008495 len--;
8496 }
8497 else if (flags & F_SIGN)
8498 sign = '+';
8499 else if (flags & F_BLANK)
8500 sign = ' ';
8501 else
8502 sign = 0;
8503 }
8504 if (width < len)
8505 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008506 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008507 reslen -= rescnt;
8508 rescnt = width + fmtcnt + 100;
8509 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008510 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008511 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008512 PyErr_NoMemory();
8513 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008514 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008515 if (_PyUnicode_Resize(&result, reslen) < 0) {
8516 Py_XDECREF(temp);
8517 goto onError;
8518 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008519 res = PyUnicode_AS_UNICODE(result)
8520 + reslen - rescnt;
8521 }
8522 if (sign) {
8523 if (fill != ' ')
8524 *res++ = sign;
8525 rescnt--;
8526 if (width > len)
8527 width--;
8528 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008529 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008530 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008531 assert(pbuf[1] == c);
8532 if (fill != ' ') {
8533 *res++ = *pbuf++;
8534 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008535 }
Tim Petersfff53252001-04-12 18:38:48 +00008536 rescnt -= 2;
8537 width -= 2;
8538 if (width < 0)
8539 width = 0;
8540 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008541 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008542 if (width > len && !(flags & F_LJUST)) {
8543 do {
8544 --rescnt;
8545 *res++ = fill;
8546 } while (--width > len);
8547 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008548 if (fill == ' ') {
8549 if (sign)
8550 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008551 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008552 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008553 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008554 *res++ = *pbuf++;
8555 *res++ = *pbuf++;
8556 }
8557 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008558 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008559 res += len;
8560 rescnt -= len;
8561 while (--width >= len) {
8562 --rescnt;
8563 *res++ = ' ';
8564 }
8565 if (dict && (argidx < arglen) && c != '%') {
8566 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008567 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008568 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008569 goto onError;
8570 }
8571 Py_XDECREF(temp);
8572 } /* '%' */
8573 } /* until end */
8574 if (argidx < arglen && !dict) {
8575 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008576 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008577 goto onError;
8578 }
8579
Thomas Woutersa96affe2006-03-12 00:29:36 +00008580 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8581 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582 if (args_owned) {
8583 Py_DECREF(args);
8584 }
8585 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586 return (PyObject *)result;
8587
8588 onError:
8589 Py_XDECREF(result);
8590 Py_DECREF(uformat);
8591 if (args_owned) {
8592 Py_DECREF(args);
8593 }
8594 return NULL;
8595}
8596
8597static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008598 (readbufferproc) unicode_buffer_getreadbuf,
8599 (writebufferproc) unicode_buffer_getwritebuf,
8600 (segcountproc) unicode_buffer_getsegcount,
8601 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602};
8603
Jeremy Hylton938ace62002-07-17 16:30:39 +00008604static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008605unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8606
Tim Peters6d6c1a32001-08-02 04:15:00 +00008607static PyObject *
8608unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8609{
8610 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00008611 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008612 char *encoding = NULL;
8613 char *errors = NULL;
8614
Guido van Rossume023fe02001-08-30 03:12:59 +00008615 if (type != &PyUnicode_Type)
8616 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008617 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8618 kwlist, &x, &encoding, &errors))
8619 return NULL;
8620 if (x == NULL)
8621 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008622 if (encoding == NULL && errors == NULL)
8623 return PyObject_Unicode(x);
8624 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008625 return PyUnicode_FromEncodedObject(x, encoding, errors);
8626}
8627
Guido van Rossume023fe02001-08-30 03:12:59 +00008628static PyObject *
8629unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8630{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008631 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008632 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008633
8634 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8635 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8636 if (tmp == NULL)
8637 return NULL;
8638 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008639 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008640 if (pnew == NULL) {
8641 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008642 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008643 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008644 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8645 if (pnew->str == NULL) {
8646 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008647 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008648 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008649 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008650 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008651 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8652 pnew->length = n;
8653 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008654 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008655 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008656}
8657
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008658PyDoc_STRVAR(unicode_doc,
Collin Winterd474ce82007-08-07 19:42:11 +00008659"str(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008660\n\
Collin Winterd474ce82007-08-07 19:42:11 +00008661Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008662encoding defaults to the current default string encoding.\n\
8663errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008664
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008665static PyObject *unicode_iter(PyObject *seq);
8666
Guido van Rossumd57fd912000-03-10 22:53:23 +00008667PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008668 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008669 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008670 sizeof(PyUnicodeObject), /* tp_size */
8671 0, /* tp_itemsize */
8672 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008673 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008674 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008675 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008676 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008677 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008678 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008679 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008680 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008681 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682 (hashfunc) unicode_hash, /* tp_hash*/
8683 0, /* tp_call*/
8684 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008685 PyObject_GenericGetAttr, /* tp_getattro */
8686 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008687 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008688 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8689 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008690 unicode_doc, /* tp_doc */
8691 0, /* tp_traverse */
8692 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008693 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008694 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008695 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008696 0, /* tp_iternext */
8697 unicode_methods, /* tp_methods */
8698 0, /* tp_members */
8699 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008700 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008701 0, /* tp_dict */
8702 0, /* tp_descr_get */
8703 0, /* tp_descr_set */
8704 0, /* tp_dictoffset */
8705 0, /* tp_init */
8706 0, /* tp_alloc */
8707 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008708 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008709};
8710
8711/* Initialize the Unicode implementation */
8712
Thomas Wouters78890102000-07-22 19:25:51 +00008713void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008714{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008715 int i;
8716
Thomas Wouters477c8d52006-05-27 19:21:47 +00008717 /* XXX - move this array to unicodectype.c ? */
8718 Py_UNICODE linebreak[] = {
8719 0x000A, /* LINE FEED */
8720 0x000D, /* CARRIAGE RETURN */
8721 0x001C, /* FILE SEPARATOR */
8722 0x001D, /* GROUP SEPARATOR */
8723 0x001E, /* RECORD SEPARATOR */
8724 0x0085, /* NEXT LINE */
8725 0x2028, /* LINE SEPARATOR */
8726 0x2029, /* PARAGRAPH SEPARATOR */
8727 };
8728
Fred Drakee4315f52000-05-09 19:53:39 +00008729 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008730 unicode_freelist = NULL;
8731 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008732 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008733 if (!unicode_empty)
8734 return;
8735
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008736 for (i = 0; i < 256; i++)
8737 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008738 if (PyType_Ready(&PyUnicode_Type) < 0)
8739 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008740
8741 /* initialize the linebreak bloom filter */
8742 bloom_linebreak = make_bloom_mask(
8743 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8744 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008745
8746 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008747}
8748
8749/* Finalize the Unicode implementation */
8750
8751void
Thomas Wouters78890102000-07-22 19:25:51 +00008752_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008753{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008754 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008755 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008756
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008757 Py_XDECREF(unicode_empty);
8758 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008759
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008760 for (i = 0; i < 256; i++) {
8761 if (unicode_latin1[i]) {
8762 Py_DECREF(unicode_latin1[i]);
8763 unicode_latin1[i] = NULL;
8764 }
8765 }
8766
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008767 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008768 PyUnicodeObject *v = u;
8769 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008770 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008771 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008772 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008773 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008774 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008775 unicode_freelist = NULL;
8776 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008777}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008778
Walter Dörwald16807132007-05-25 13:52:07 +00008779void
8780PyUnicode_InternInPlace(PyObject **p)
8781{
8782 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
8783 PyObject *t;
8784 if (s == NULL || !PyUnicode_Check(s))
8785 Py_FatalError(
8786 "PyUnicode_InternInPlace: unicode strings only please!");
8787 /* If it's a subclass, we don't really know what putting
8788 it in the interned dict might do. */
8789 if (!PyUnicode_CheckExact(s))
8790 return;
8791 if (PyUnicode_CHECK_INTERNED(s))
8792 return;
8793 if (interned == NULL) {
8794 interned = PyDict_New();
8795 if (interned == NULL) {
8796 PyErr_Clear(); /* Don't leave an exception */
8797 return;
8798 }
8799 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00008800 /* It might be that the GetItem call fails even
8801 though the key is present in the dictionary,
8802 namely when this happens during a stack overflow. */
8803 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00008804 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00008805 Py_END_ALLOW_RECURSION
8806
Walter Dörwald16807132007-05-25 13:52:07 +00008807 if (t) {
8808 Py_INCREF(t);
8809 Py_DECREF(*p);
8810 *p = t;
8811 return;
8812 }
8813
Martin v. Löwis5b222132007-06-10 09:51:05 +00008814 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00008815 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
8816 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00008817 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00008818 return;
8819 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00008820 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00008821 /* The two references in interned are not counted by refcnt.
8822 The deallocator will take care of this */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00008823 Py_Refcnt(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00008824 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
8825}
8826
8827void
8828PyUnicode_InternImmortal(PyObject **p)
8829{
8830 PyUnicode_InternInPlace(p);
8831 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
8832 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
8833 Py_INCREF(*p);
8834 }
8835}
8836
8837PyObject *
8838PyUnicode_InternFromString(const char *cp)
8839{
8840 PyObject *s = PyUnicode_FromString(cp);
8841 if (s == NULL)
8842 return NULL;
8843 PyUnicode_InternInPlace(&s);
8844 return s;
8845}
8846
8847void _Py_ReleaseInternedUnicodeStrings(void)
8848{
8849 PyObject *keys;
8850 PyUnicodeObject *s;
8851 Py_ssize_t i, n;
8852 Py_ssize_t immortal_size = 0, mortal_size = 0;
8853
8854 if (interned == NULL || !PyDict_Check(interned))
8855 return;
8856 keys = PyDict_Keys(interned);
8857 if (keys == NULL || !PyList_Check(keys)) {
8858 PyErr_Clear();
8859 return;
8860 }
8861
8862 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
8863 detector, interned unicode strings are not forcibly deallocated;
8864 rather, we give them their stolen references back, and then clear
8865 and DECREF the interned dict. */
8866
8867 n = PyList_GET_SIZE(keys);
8868 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
8869 n);
8870 for (i = 0; i < n; i++) {
8871 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
8872 switch (s->state) {
8873 case SSTATE_NOT_INTERNED:
8874 /* XXX Shouldn't happen */
8875 break;
8876 case SSTATE_INTERNED_IMMORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00008877 Py_Refcnt(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00008878 immortal_size += s->length;
8879 break;
8880 case SSTATE_INTERNED_MORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00008881 Py_Refcnt(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00008882 mortal_size += s->length;
8883 break;
8884 default:
8885 Py_FatalError("Inconsistent interned string state.");
8886 }
8887 s->state = SSTATE_NOT_INTERNED;
8888 }
8889 fprintf(stderr, "total size of all interned strings: "
8890 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
8891 "mortal/immortal\n", mortal_size, immortal_size);
8892 Py_DECREF(keys);
8893 PyDict_Clear(interned);
8894 Py_DECREF(interned);
8895 interned = NULL;
8896}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008897
8898
8899/********************* Unicode Iterator **************************/
8900
8901typedef struct {
8902 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00008903 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008904 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
8905} unicodeiterobject;
8906
8907static void
8908unicodeiter_dealloc(unicodeiterobject *it)
8909{
8910 _PyObject_GC_UNTRACK(it);
8911 Py_XDECREF(it->it_seq);
8912 PyObject_GC_Del(it);
8913}
8914
8915static int
8916unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8917{
8918 Py_VISIT(it->it_seq);
8919 return 0;
8920}
8921
8922static PyObject *
8923unicodeiter_next(unicodeiterobject *it)
8924{
8925 PyUnicodeObject *seq;
8926 PyObject *item;
8927
8928 assert(it != NULL);
8929 seq = it->it_seq;
8930 if (seq == NULL)
8931 return NULL;
8932 assert(PyUnicode_Check(seq));
8933
8934 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008935 item = PyUnicode_FromUnicode(
8936 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008937 if (item != NULL)
8938 ++it->it_index;
8939 return item;
8940 }
8941
8942 Py_DECREF(seq);
8943 it->it_seq = NULL;
8944 return NULL;
8945}
8946
8947static PyObject *
8948unicodeiter_len(unicodeiterobject *it)
8949{
8950 Py_ssize_t len = 0;
8951 if (it->it_seq)
8952 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8953 return PyInt_FromSsize_t(len);
8954}
8955
8956PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8957
8958static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008959 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8960 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008961 {NULL, NULL} /* sentinel */
8962};
8963
8964PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008965 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008966 "unicodeiterator", /* tp_name */
8967 sizeof(unicodeiterobject), /* tp_basicsize */
8968 0, /* tp_itemsize */
8969 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00008970 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008971 0, /* tp_print */
8972 0, /* tp_getattr */
8973 0, /* tp_setattr */
8974 0, /* tp_compare */
8975 0, /* tp_repr */
8976 0, /* tp_as_number */
8977 0, /* tp_as_sequence */
8978 0, /* tp_as_mapping */
8979 0, /* tp_hash */
8980 0, /* tp_call */
8981 0, /* tp_str */
8982 PyObject_GenericGetAttr, /* tp_getattro */
8983 0, /* tp_setattro */
8984 0, /* tp_as_buffer */
8985 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8986 0, /* tp_doc */
8987 (traverseproc)unicodeiter_traverse, /* tp_traverse */
8988 0, /* tp_clear */
8989 0, /* tp_richcompare */
8990 0, /* tp_weaklistoffset */
8991 PyObject_SelfIter, /* tp_iter */
8992 (iternextfunc)unicodeiter_next, /* tp_iternext */
8993 unicodeiter_methods, /* tp_methods */
8994 0,
8995};
8996
8997static PyObject *
8998unicode_iter(PyObject *seq)
8999{
9000 unicodeiterobject *it;
9001
9002 if (!PyUnicode_Check(seq)) {
9003 PyErr_BadInternalCall();
9004 return NULL;
9005 }
9006 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9007 if (it == NULL)
9008 return NULL;
9009 it->it_index = 0;
9010 Py_INCREF(seq);
9011 it->it_seq = (PyUnicodeObject *)seq;
9012 _PyObject_GC_TRACK(it);
9013 return (PyObject *)it;
9014}
9015
Martin v. Löwis5b222132007-06-10 09:51:05 +00009016size_t
9017Py_UNICODE_strlen(const Py_UNICODE *u)
9018{
9019 int res = 0;
9020 while(*u++)
9021 res++;
9022 return res;
9023}
9024
9025Py_UNICODE*
9026Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9027{
9028 Py_UNICODE *u = s1;
9029 while ((*u++ = *s2++));
9030 return s1;
9031}
9032
9033Py_UNICODE*
9034Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9035{
9036 Py_UNICODE *u = s1;
9037 while ((*u++ = *s2++))
9038 if (n-- == 0)
9039 break;
9040 return s1;
9041}
9042
9043int
9044Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9045{
9046 while (*s1 && *s2 && *s1 == *s2)
9047 s1++, s2++;
9048 if (*s1 && *s2)
9049 return (*s1 < *s2) ? -1 : +1;
9050 if (*s1)
9051 return 1;
9052 if (*s2)
9053 return -1;
9054 return 0;
9055}
9056
9057Py_UNICODE*
9058Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9059{
9060 const Py_UNICODE *p;
9061 for (p = s; *p; p++)
9062 if (*p == c)
9063 return (Py_UNICODE*)p;
9064 return NULL;
9065}
9066
9067
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009068#ifdef __cplusplus
9069}
9070#endif
9071
9072
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009073/*
9074Local variables:
9075c-basic-offset: 4
9076indent-tabs-mode: nil
9077End:
9078*/