blob: e227fc72d2764fcd098acbd567a745b0fdc2ed2d [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Walter Dörwald16807132007-05-25 13:52:07 +000095/* This dictionary holds all interned unicode strings. Note that references
96 to strings in this dictionary are *not* counted in the string's ob_refcnt.
97 When the interned string reaches a refcnt of 0 the string deallocation
98 function will delete the reference from this dictionary.
99
   Another way to look at this is to say that the actual reference
   count of a string is: s->ob_refcnt + (s->ob_sstate?2:0)
102*/
103static PyObject *interned;
104
Guido van Rossumd57fd912000-03-10 22:53:23 +0000105/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000106static PyUnicodeObject *unicode_freelist;
107static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000109/* The empty Unicode object is shared to improve performance. */
110static PyUnicodeObject *unicode_empty;
111
112/* Single character Unicode strings in the Latin-1 range are being
113 shared as well. */
114static PyUnicodeObject *unicode_latin1[256];
115
Fred Drakee4315f52000-05-09 19:53:39 +0000116/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000117 parameter; it is fixed to "utf-8". Always use the
118 PyUnicode_GetDefaultEncoding() API to access this global. */
119static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000120
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000121Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000122PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000123{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000124#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000125 return 0x10FFFF;
126#else
127 /* This is actually an illegal character, so it should
128 not be passed to unichr. */
129 return 0xFFFF;
130#endif
131}
132
Thomas Wouters477c8d52006-05-27 19:21:47 +0000133/* --- Bloom Filters ----------------------------------------------------- */
134
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the least
   significant 5 bits of each unicode character as the bit index. */
138
139/* the linebreak mask is set up by Unicode_Init below */
140
/* Type of a bloom bitmask; only the low 32 bits are ever used because
   the bit index is (ch & 0x1F). */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is 1UL (not 1) so that selecting bit 31 neither
   overflows a signed int nor sign-extends into the upper half of a
   64-bit mask. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap pre-filter on the linebreak mask, then the exact check. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
149
150Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
151{
152 /* calculate simple bloom-style bitmask for a given unicode string */
153
154 long mask;
155 Py_ssize_t i;
156
157 mask = 0;
158 for (i = 0; i < len; i++)
159 mask |= (1 << (ptr[i] & 0x1F));
160
161 return mask;
162}
163
164Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
165{
166 Py_ssize_t i;
167
168 for (i = 0; i < setlen; i++)
169 if (set[i] == chr)
170 return 1;
171
172 return 0;
173}
174
/* Bloom pre-filter followed by the exact membership test.  The whole
   expansion is parenthesized so the macro can be negated or combined
   with other operators safely.  Note that chr is evaluated twice. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
177
Guido van Rossumd57fd912000-03-10 22:53:23 +0000178/* --- Unicode Object ----------------------------------------------------- */
179
180static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000182 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183{
184 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000185
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000186 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000187 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000188 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190 /* Resizing shared object (unicode_empty or single character
191 objects) in-place is not allowed. Use PyUnicode_Resize()
192 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000194 if (unicode == unicode_empty ||
195 (unicode->length == 1 &&
196 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 return -1;
201 }
202
Thomas Wouters477c8d52006-05-27 19:21:47 +0000203 /* We allocate one more byte to make sure the string is Ux0000 terminated.
204 The overallocation is also used by fastsearch, which assumes that it's
205 safe to look at str[length] (without making any assumptions about what
206 it contains). */
207
Guido van Rossumd57fd912000-03-10 22:53:23 +0000208 oldstr = unicode->str;
209 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
210 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000211 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212 PyErr_NoMemory();
213 return -1;
214 }
215 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000216 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000218 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000220 if (unicode->defenc) {
221 Py_DECREF(unicode->defenc);
222 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000223 }
224 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000225
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 return 0;
227}
228
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000232
233 XXX This allocator could further be enhanced by assuring that the
234 free list never reduces its size below 1.
235
236*/
237
238static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000239PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000240{
241 register PyUnicodeObject *unicode;
242
Thomas Wouters477c8d52006-05-27 19:21:47 +0000243 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244 if (length == 0 && unicode_empty != NULL) {
245 Py_INCREF(unicode_empty);
246 return unicode_empty;
247 }
248
249 /* Unicode freelist & memory allocation */
250 if (unicode_freelist) {
251 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000252 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000255 /* Keep-Alive optimization: we only upsize the buffer,
256 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000257 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000258 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000259 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000260 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000261 }
262 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000263 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000265 }
266 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 }
268 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000269 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000270 if (unicode == NULL)
271 return NULL;
272 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
273 }
274
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000275 if (!unicode->str) {
276 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000277 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000278 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000279 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000280 * the caller fails before initializing str -- unicode_resize()
281 * reads str[0], and the Keep-Alive optimization can keep memory
282 * allocated for str alive across a call to unicode_dealloc(unicode).
283 * We don't want unicode_resize to read uninitialized memory in
284 * that case.
285 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000286 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000288 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000290 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000291 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000293
294 onError:
295 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000296 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000297 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298}
299
300static
Guido van Rossum9475a232001-10-05 20:51:39 +0000301void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302{
Walter Dörwald16807132007-05-25 13:52:07 +0000303 switch (PyUnicode_CHECK_INTERNED(unicode)) {
304 case SSTATE_NOT_INTERNED:
305 break;
306
307 case SSTATE_INTERNED_MORTAL:
308 /* revive dead object temporarily for DelItem */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +0000309 Py_Refcnt(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000310 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
311 Py_FatalError(
312 "deletion of interned unicode string failed");
313 break;
314
315 case SSTATE_INTERNED_IMMORTAL:
316 Py_FatalError("Immortal interned unicode string died.");
317
318 default:
319 Py_FatalError("Inconsistent interned unicode string state.");
320 }
321
Guido van Rossum604ddf82001-12-06 20:03:56 +0000322 if (PyUnicode_CheckExact(unicode) &&
323 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000324 /* Keep-Alive optimization */
325 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000326 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000327 unicode->str = NULL;
328 unicode->length = 0;
329 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000330 if (unicode->defenc) {
331 Py_DECREF(unicode->defenc);
332 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000333 }
334 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000335 *(PyUnicodeObject **)unicode = unicode_freelist;
336 unicode_freelist = unicode;
337 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000338 }
339 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000340 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000341 Py_XDECREF(unicode->defenc);
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000342 Py_Type(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 }
344}
345
Martin v. Löwis18e16552006-02-15 17:27:45 +0000346int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000347{
348 register PyUnicodeObject *v;
349
350 /* Argument checks */
351 if (unicode == NULL) {
352 PyErr_BadInternalCall();
353 return -1;
354 }
355 v = (PyUnicodeObject *)*unicode;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000356 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000357 PyErr_BadInternalCall();
358 return -1;
359 }
360
361 /* Resizing unicode_empty and single character objects is not
362 possible since these are being shared. We simply return a fresh
363 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000364 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000365 (v == unicode_empty || v->length == 1)) {
366 PyUnicodeObject *w = _PyUnicode_New(length);
367 if (w == NULL)
368 return -1;
369 Py_UNICODE_COPY(w->str, v->str,
370 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000371 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000372 *unicode = (PyObject *)w;
373 return 0;
374 }
375
376 /* Note that we don't have to modify *unicode for unshared Unicode
377 objects, since we can modify them in-place. */
378 return unicode_resize(v, length);
379}
380
/* Internal API for use in unicodeobject.c only !
   Convenience wrapper that casts the address of a PyUnicodeObject*
   variable to the PyObject** expected by PyUnicode_Resize().  Both
   arguments are parenthesized so arbitrary expressions can be passed. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
384
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000386 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387{
388 PyUnicodeObject *unicode;
389
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000390 /* If the Unicode data is known at construction time, we can apply
391 some optimizations which share commonly used objects. */
392 if (u != NULL) {
393
394 /* Optimization for empty strings */
395 if (size == 0 && unicode_empty != NULL) {
396 Py_INCREF(unicode_empty);
397 return (PyObject *)unicode_empty;
398 }
399
400 /* Single character Unicode objects in the Latin-1 range are
401 shared when using this constructor */
402 if (size == 1 && *u < 256) {
403 unicode = unicode_latin1[*u];
404 if (!unicode) {
405 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000406 if (!unicode)
407 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000408 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000409 unicode_latin1[*u] = unicode;
410 }
411 Py_INCREF(unicode);
412 return (PyObject *)unicode;
413 }
414 }
Tim Petersced69f82003-09-16 20:30:58 +0000415
Guido van Rossumd57fd912000-03-10 22:53:23 +0000416 unicode = _PyUnicode_New(size);
417 if (!unicode)
418 return NULL;
419
420 /* Copy the Unicode data into the new object */
421 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000422 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423
424 return (PyObject *)unicode;
425}
426
Walter Dörwaldd2034312007-05-18 16:29:38 +0000427PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000428{
429 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000430 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000431 some optimizations which share commonly used objects.
432 Also, this means the input must be UTF-8, so fall back to the
433 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000434 if (u != NULL) {
435
436 /* Optimization for empty strings */
437 if (size == 0 && unicode_empty != NULL) {
438 Py_INCREF(unicode_empty);
439 return (PyObject *)unicode_empty;
440 }
441
Martin v. Löwis9c121062007-08-05 20:26:11 +0000442 /* Single characters are shared when using this constructor.
443 Restrict to ASCII, since the input must be UTF-8. */
444 if (size == 1 && Py_CHARMASK(*u) < 128) {
Guido van Rossum00058aa2007-07-19 18:21:28 +0000445 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000446 if (!unicode) {
447 unicode = _PyUnicode_New(1);
448 if (!unicode)
449 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000450 unicode->str[0] = Py_CHARMASK(*u);
451 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000452 }
453 Py_INCREF(unicode);
454 return (PyObject *)unicode;
455 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000456
457 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000458 }
459
Walter Dörwald55507312007-05-18 13:12:10 +0000460 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000461 if (!unicode)
462 return NULL;
463
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000464 return (PyObject *)unicode;
465}
466
Walter Dörwaldd2034312007-05-18 16:29:38 +0000467PyObject *PyUnicode_FromString(const char *u)
468{
469 size_t size = strlen(u);
470 if (size > PY_SSIZE_T_MAX) {
471 PyErr_SetString(PyExc_OverflowError, "input too long");
472 return NULL;
473 }
474
475 return PyUnicode_FromStringAndSize(u, size);
476}
477
Guido van Rossumd57fd912000-03-10 22:53:23 +0000478#ifdef HAVE_WCHAR_H
479
480PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000481 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000482{
483 PyUnicodeObject *unicode;
484
485 if (w == NULL) {
486 PyErr_BadInternalCall();
487 return NULL;
488 }
489
490 unicode = _PyUnicode_New(size);
491 if (!unicode)
492 return NULL;
493
494 /* Copy the wchar_t data into the new object */
495#ifdef HAVE_USABLE_WCHAR_T
496 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000497#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498 {
499 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000500 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000501 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000502 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503 *u++ = *w++;
504 }
505#endif
506
507 return (PyObject *)unicode;
508}
509
Walter Dörwald346737f2007-05-31 10:44:43 +0000510static void
511makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
512{
513 *fmt++ = '%';
514 if (width) {
515 if (zeropad)
516 *fmt++ = '0';
517 fmt += sprintf(fmt, "%d", width);
518 }
519 if (precision)
520 fmt += sprintf(fmt, ".%d", precision);
521 if (longflag)
522 *fmt++ = 'l';
523 else if (size_tflag) {
524 char *f = PY_FORMAT_SIZE_T;
525 while (*f)
526 *fmt++ = *f++;
527 }
528 *fmt++ = c;
529 *fmt = '\0';
530}
531
/* Append the NUL-terminated char string `string` to the output cursor
   `s`, one character per output element, advancing `s`.  Relies on the
   caller having `s` and a scratch pointer `copy` in scope.  Wrapped in
   do { } while (0) so that `appendstring(x);` is a single statement and
   is safe inside unbraced if/else. */
#define appendstring(string) \
    do { for (copy = (string); *copy;) *s++ = *copy++; } while (0)
533
534PyObject *
535PyUnicode_FromFormatV(const char *format, va_list vargs)
536{
537 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000538 Py_ssize_t callcount = 0;
539 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000540 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000541 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000542 int width = 0;
543 int precision = 0;
544 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000545 const char* f;
546 Py_UNICODE *s;
547 PyObject *string;
548 /* used by sprintf */
549 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000550 /* use abuffer instead of buffer, if we need more space
551 * (which can happen if there's a format specifier with width). */
552 char *abuffer = NULL;
553 char *realbuffer;
554 Py_ssize_t abuffersize = 0;
555 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000556 const char *copy;
557
558#ifdef VA_LIST_IS_ARRAY
559 Py_MEMCPY(count, vargs, sizeof(va_list));
560#else
561#ifdef __va_copy
562 __va_copy(count, vargs);
563#else
564 count = vargs;
565#endif
566#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000567 /* step 1: count the number of %S/%R format specifications
568 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
569 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000570 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000571 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000572 ++callcount;
573 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000574 /* step 2: allocate memory for the results of
575 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000576 if (callcount) {
577 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
578 if (!callresults) {
579 PyErr_NoMemory();
580 return NULL;
581 }
582 callresult = callresults;
583 }
584 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000585 for (f = format; *f; f++) {
586 if (*f == '%') {
587 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000588 width = 0;
589 while (isdigit(Py_CHARMASK(*f)))
590 width = (width*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000591 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
592 ;
593
594 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
595 * they don't affect the amount of space we reserve.
596 */
597 if ((*f == 'l' || *f == 'z') &&
598 (f[1] == 'd' || f[1] == 'u'))
599 ++f;
600
601 switch (*f) {
602 case 'c':
603 (void)va_arg(count, int);
604 /* fall through... */
605 case '%':
606 n++;
607 break;
608 case 'd': case 'u': case 'i': case 'x':
609 (void) va_arg(count, int);
610 /* 20 bytes is enough to hold a 64-bit
611 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000612 This isn't enough for octal.
613 If a width is specified we need more
614 (which we allocate later). */
615 if (width < 20)
616 width = 20;
617 n += width;
618 if (abuffersize < width)
619 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000620 break;
621 case 's':
622 n += strlen(va_arg(count, char*));
623 break;
624 case 'U':
625 {
626 PyObject *obj = va_arg(count, PyObject *);
627 assert(obj && PyUnicode_Check(obj));
628 n += PyUnicode_GET_SIZE(obj);
629 break;
630 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000631 case 'V':
632 {
633 PyObject *obj = va_arg(count, PyObject *);
634 const char *str = va_arg(count, const char *);
635 assert(obj || str);
636 assert(!obj || PyUnicode_Check(obj));
637 if (obj)
638 n += PyUnicode_GET_SIZE(obj);
639 else
640 n += strlen(str);
641 break;
642 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000643 case 'S':
644 {
645 PyObject *obj = va_arg(count, PyObject *);
646 PyObject *str;
647 assert(obj);
648 str = PyObject_Unicode(obj);
649 if (!str)
650 goto fail;
651 n += PyUnicode_GET_SIZE(str);
652 /* Remember the str and switch to the next slot */
653 *callresult++ = str;
654 break;
655 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000656 case 'R':
657 {
658 PyObject *obj = va_arg(count, PyObject *);
659 PyObject *repr;
660 assert(obj);
661 repr = PyObject_Repr(obj);
662 if (!repr)
663 goto fail;
664 n += PyUnicode_GET_SIZE(repr);
665 /* Remember the repr and switch to the next slot */
666 *callresult++ = repr;
667 break;
668 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000669 case 'p':
670 (void) va_arg(count, int);
671 /* maximum 64-bit pointer representation:
672 * 0xffffffffffffffff
673 * so 19 characters is enough.
674 * XXX I count 18 -- what's the extra for?
675 */
676 n += 19;
677 break;
678 default:
679 /* if we stumble upon an unknown
680 formatting code, copy the rest of
681 the format string to the output
682 string. (we cannot just skip the
683 code, since there's no way to know
684 what's in the argument list) */
685 n += strlen(p);
686 goto expand;
687 }
688 } else
689 n++;
690 }
691 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000692 if (abuffersize > 20) {
693 abuffer = PyMem_Malloc(abuffersize);
694 if (!abuffer) {
695 PyErr_NoMemory();
696 goto fail;
697 }
698 realbuffer = abuffer;
699 }
700 else
701 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000702 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000703 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000704 we don't have to resize the string.
705 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000706 string = PyUnicode_FromUnicode(NULL, n);
707 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000708 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000709
710 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000711 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000712
713 for (f = format; *f; f++) {
714 if (*f == '%') {
715 const char* p = f++;
716 int longflag = 0;
717 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000718 zeropad = (*f == '0');
719 /* parse the width.precision part */
720 width = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000721 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000722 width = (width*10) + *f++ - '0';
723 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000724 if (*f == '.') {
725 f++;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000726 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000727 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000728 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000729 /* handle the long flag, but only for %ld and %lu.
730 others can be added when necessary. */
731 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
732 longflag = 1;
733 ++f;
734 }
735 /* handle the size_t flag. */
736 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
737 size_tflag = 1;
738 ++f;
739 }
740
741 switch (*f) {
742 case 'c':
743 *s++ = va_arg(vargs, int);
744 break;
745 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000746 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000747 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000748 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000749 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000750 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000751 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000752 sprintf(realbuffer, fmt, va_arg(vargs, int));
753 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000754 break;
755 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000756 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000757 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000758 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000759 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000760 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000761 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000762 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
763 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000764 break;
765 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000766 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
767 sprintf(realbuffer, fmt, va_arg(vargs, int));
768 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000769 break;
770 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000771 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
772 sprintf(realbuffer, fmt, va_arg(vargs, int));
773 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000774 break;
775 case 's':
776 p = va_arg(vargs, char*);
777 appendstring(p);
778 break;
779 case 'U':
780 {
781 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000782 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
783 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
784 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000785 break;
786 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000787 case 'V':
788 {
789 PyObject *obj = va_arg(vargs, PyObject *);
790 const char *str = va_arg(vargs, const char *);
791 if (obj) {
792 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
793 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
794 s += size;
795 } else {
796 appendstring(str);
797 }
798 break;
799 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000800 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000801 case 'R':
802 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000803 Py_UNICODE *ucopy;
804 Py_ssize_t usize;
805 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000806 /* unused, since we already have the result */
807 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000808 ucopy = PyUnicode_AS_UNICODE(*callresult);
809 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000810 for (upos = 0; upos<usize;)
811 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000812 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000813 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000814 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000815 ++callresult;
816 break;
817 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000818 case 'p':
819 sprintf(buffer, "%p", va_arg(vargs, void*));
820 /* %p is ill-defined: ensure leading 0x. */
821 if (buffer[1] == 'X')
822 buffer[1] = 'x';
823 else if (buffer[1] != 'x') {
824 memmove(buffer+2, buffer, strlen(buffer)+1);
825 buffer[0] = '0';
826 buffer[1] = 'x';
827 }
828 appendstring(buffer);
829 break;
830 case '%':
831 *s++ = '%';
832 break;
833 default:
834 appendstring(p);
835 goto end;
836 }
837 } else
838 *s++ = *f;
839 }
840
841 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000842 if (callresults)
843 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000844 if (abuffer)
845 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000846 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
847 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000848 fail:
849 if (callresults) {
850 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000851 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000852 Py_DECREF(*callresult2);
853 ++callresult2;
854 }
855 PyMem_Free(callresults);
856 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000857 if (abuffer)
858 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000859 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000860}
861
862#undef appendstring
863
864PyObject *
865PyUnicode_FromFormat(const char *format, ...)
866{
867 PyObject* ret;
868 va_list vargs;
869
870#ifdef HAVE_STDARG_PROTOTYPES
871 va_start(vargs, format);
872#else
873 va_start(vargs);
874#endif
875 ret = PyUnicode_FromFormatV(format, vargs);
876 va_end(vargs);
877 return ret;
878}
879
Martin v. Löwis18e16552006-02-15 17:27:45 +0000880Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
881 wchar_t *w,
882 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000883{
884 if (unicode == NULL) {
885 PyErr_BadInternalCall();
886 return -1;
887 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000888
889 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000890 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000891 size = PyUnicode_GET_SIZE(unicode) + 1;
892
Guido van Rossumd57fd912000-03-10 22:53:23 +0000893#ifdef HAVE_USABLE_WCHAR_T
894 memcpy(w, unicode->str, size * sizeof(wchar_t));
895#else
896 {
897 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000898 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000899 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000900 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000901 *w++ = *u++;
902 }
903#endif
904
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000905 if (size > PyUnicode_GET_SIZE(unicode))
906 return PyUnicode_GET_SIZE(unicode);
907 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000908 return size;
909}
910
911#endif
912
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000913PyObject *PyUnicode_FromOrdinal(int ordinal)
914{
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000915 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000916
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000917 if (ordinal < 0 || ordinal > 0x10ffff) {
918 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000919 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000920 return NULL;
921 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000922
923#ifndef Py_UNICODE_WIDE
924 if (ordinal > 0xffff) {
925 ordinal -= 0x10000;
926 s[0] = 0xD800 | (ordinal >> 10);
927 s[1] = 0xDC00 | (ordinal & 0x3FF);
928 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000929 }
930#endif
931
Hye-Shik Chang40574832004-04-06 07:24:51 +0000932 s[0] = (Py_UNICODE)ordinal;
933 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000934}
935
Guido van Rossumd57fd912000-03-10 22:53:23 +0000936PyObject *PyUnicode_FromObject(register PyObject *obj)
937{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000938 /* XXX Perhaps we should make this API an alias of
939 PyObject_Unicode() instead ?! */
940 if (PyUnicode_CheckExact(obj)) {
941 Py_INCREF(obj);
942 return obj;
943 }
944 if (PyUnicode_Check(obj)) {
945 /* For a Unicode subtype that's not a Unicode object,
946 return a true Unicode object with the same data. */
947 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
948 PyUnicode_GET_SIZE(obj));
949 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000950 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
951}
952
953PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
954 const char *encoding,
955 const char *errors)
956{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000957 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000958 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000959 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000960
Guido van Rossumd57fd912000-03-10 22:53:23 +0000961 if (obj == NULL) {
962 PyErr_BadInternalCall();
963 return NULL;
964 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000965
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000966#if 0
967 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000968 that no encodings is given and then redirect to
969 PyObject_Unicode() which then applies the additional logic for
970 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000971
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000972 NOTE: This API should really only be used for object which
973 represent *encoded* Unicode !
974
975 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000976 if (PyUnicode_Check(obj)) {
977 if (encoding) {
978 PyErr_SetString(PyExc_TypeError,
979 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000980 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000981 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000982 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000983 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000984#else
985 if (PyUnicode_Check(obj)) {
986 PyErr_SetString(PyExc_TypeError,
987 "decoding Unicode is not supported");
988 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000989 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000990#endif
991
992 /* Coerce object */
993 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000994 s = PyString_AS_STRING(obj);
995 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000996 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000997 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
998 /* Overwrite the error message with something more useful in
999 case of a TypeError. */
1000 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001001 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001002 "coercing to Unicode: need string or buffer, "
1003 "%.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001004 Py_Type(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001005 goto onError;
1006 }
Tim Petersced69f82003-09-16 20:30:58 +00001007
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001008 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001009 if (len == 0) {
1010 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001011 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001012 }
Tim Petersced69f82003-09-16 20:30:58 +00001013 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001014 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001015
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001016 return v;
1017
1018 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001019 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001020}
1021
1022PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001023 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001024 const char *encoding,
1025 const char *errors)
1026{
1027 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001028
1029 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001030 encoding = PyUnicode_GetDefaultEncoding();
1031
1032 /* Shortcuts for common default encodings */
1033 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001034 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001035 else if (strcmp(encoding, "latin-1") == 0)
1036 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001037#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1038 else if (strcmp(encoding, "mbcs") == 0)
1039 return PyUnicode_DecodeMBCS(s, size, errors);
1040#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001041 else if (strcmp(encoding, "ascii") == 0)
1042 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001043
1044 /* Decode via the codec registry */
1045 buffer = PyBuffer_FromMemory((void *)s, size);
1046 if (buffer == NULL)
1047 goto onError;
1048 unicode = PyCodec_Decode(buffer, encoding, errors);
1049 if (unicode == NULL)
1050 goto onError;
1051 if (!PyUnicode_Check(unicode)) {
1052 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001053 "decoder did not return an unicode object (type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001054 Py_Type(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001055 Py_DECREF(unicode);
1056 goto onError;
1057 }
1058 Py_DECREF(buffer);
1059 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001060
Guido van Rossumd57fd912000-03-10 22:53:23 +00001061 onError:
1062 Py_XDECREF(buffer);
1063 return NULL;
1064}
1065
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001066PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1067 const char *encoding,
1068 const char *errors)
1069{
1070 PyObject *v;
1071
1072 if (!PyUnicode_Check(unicode)) {
1073 PyErr_BadArgument();
1074 goto onError;
1075 }
1076
1077 if (encoding == NULL)
1078 encoding = PyUnicode_GetDefaultEncoding();
1079
1080 /* Decode via the codec registry */
1081 v = PyCodec_Decode(unicode, encoding, errors);
1082 if (v == NULL)
1083 goto onError;
1084 return v;
1085
1086 onError:
1087 return NULL;
1088}
1089
Guido van Rossumd57fd912000-03-10 22:53:23 +00001090PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001091 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001092 const char *encoding,
1093 const char *errors)
1094{
1095 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001096
Guido van Rossumd57fd912000-03-10 22:53:23 +00001097 unicode = PyUnicode_FromUnicode(s, size);
1098 if (unicode == NULL)
1099 return NULL;
1100 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1101 Py_DECREF(unicode);
1102 return v;
1103}
1104
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001105PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1106 const char *encoding,
1107 const char *errors)
1108{
1109 PyObject *v;
1110
1111 if (!PyUnicode_Check(unicode)) {
1112 PyErr_BadArgument();
1113 goto onError;
1114 }
1115
1116 if (encoding == NULL)
1117 encoding = PyUnicode_GetDefaultEncoding();
1118
1119 /* Encode via the codec registry */
1120 v = PyCodec_Encode(unicode, encoding, errors);
1121 if (v == NULL)
1122 goto onError;
1123 return v;
1124
1125 onError:
1126 return NULL;
1127}
1128
Guido van Rossumd57fd912000-03-10 22:53:23 +00001129PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1130 const char *encoding,
1131 const char *errors)
1132{
1133 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001134
Guido van Rossumd57fd912000-03-10 22:53:23 +00001135 if (!PyUnicode_Check(unicode)) {
1136 PyErr_BadArgument();
1137 goto onError;
1138 }
Fred Drakee4315f52000-05-09 19:53:39 +00001139
Tim Petersced69f82003-09-16 20:30:58 +00001140 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001141 encoding = PyUnicode_GetDefaultEncoding();
1142
1143 /* Shortcuts for common default encodings */
1144 if (errors == NULL) {
1145 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001146 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001147 else if (strcmp(encoding, "latin-1") == 0)
1148 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001149#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1150 else if (strcmp(encoding, "mbcs") == 0)
1151 return PyUnicode_AsMBCSString(unicode);
1152#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001153 else if (strcmp(encoding, "ascii") == 0)
1154 return PyUnicode_AsASCIIString(unicode);
1155 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001156
1157 /* Encode via the codec registry */
1158 v = PyCodec_Encode(unicode, encoding, errors);
1159 if (v == NULL)
1160 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001161 if (!PyBytes_Check(v)) {
1162 if (PyString_Check(v)) {
1163 /* Old codec, turn it into bytes */
1164 PyObject *b = PyBytes_FromObject(v);
1165 Py_DECREF(v);
1166 return b;
1167 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001169 "encoder did not return a bytes object "
1170 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1171 v->ob_type->tp_name,
1172 encoding ? encoding : "NULL",
1173 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174 Py_DECREF(v);
1175 goto onError;
1176 }
1177 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001178
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179 onError:
1180 return NULL;
1181}
1182
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001183PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1184 const char *errors)
1185{
1186 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001187 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001188 if (v)
1189 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001190 if (errors != NULL)
1191 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum06610092007-08-16 21:02:22 +00001192 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1193 PyUnicode_GET_SIZE(unicode),
1194 NULL);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001195 if (!b)
1196 return NULL;
1197 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1198 PyBytes_Size(b));
1199 Py_DECREF(b);
Guido van Rossume7a0d392007-07-12 07:53:00 +00001200 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001201 return v;
1202}
1203
Martin v. Löwis5b222132007-06-10 09:51:05 +00001204char*
1205PyUnicode_AsString(PyObject *unicode)
1206{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001207 if (!PyUnicode_Check(unicode)) {
1208 PyErr_BadArgument();
1209 return NULL;
1210 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00001211 unicode = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1212 if (!unicode)
1213 return NULL;
1214 return PyString_AsString(unicode);
1215}
1216
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1218{
1219 if (!PyUnicode_Check(unicode)) {
1220 PyErr_BadArgument();
1221 goto onError;
1222 }
1223 return PyUnicode_AS_UNICODE(unicode);
1224
1225 onError:
1226 return NULL;
1227}
1228
Martin v. Löwis18e16552006-02-15 17:27:45 +00001229Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230{
1231 if (!PyUnicode_Check(unicode)) {
1232 PyErr_BadArgument();
1233 goto onError;
1234 }
1235 return PyUnicode_GET_SIZE(unicode);
1236
1237 onError:
1238 return -1;
1239}
1240
Thomas Wouters78890102000-07-22 19:25:51 +00001241const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001242{
1243 return unicode_default_encoding;
1244}
1245
1246int PyUnicode_SetDefaultEncoding(const char *encoding)
1247{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001248 if (strcmp(encoding, unicode_default_encoding) != 0) {
1249 PyErr_Format(PyExc_ValueError,
1250 "Can only set default encoding to %s",
1251 unicode_default_encoding);
1252 return -1;
1253 }
Fred Drakee4315f52000-05-09 19:53:39 +00001254 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001255}
1256
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001257/* error handling callback helper:
1258 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001259 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001260 and adjust various state variables.
1261 return 0 on success, -1 on error
1262*/
1263
1264static
1265int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1266 const char *encoding, const char *reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001267 const char **input, const char **inend, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001268 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001269{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001270 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001271
1272 PyObject *restuple = NULL;
1273 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001274 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001275 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001276 Py_ssize_t requiredsize;
1277 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001278 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001279 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001280 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001281 int res = -1;
1282
1283 if (*errorHandler == NULL) {
1284 *errorHandler = PyCodec_LookupError(errors);
1285 if (*errorHandler == NULL)
1286 goto onError;
1287 }
1288
1289 if (*exceptionObject == NULL) {
1290 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001291 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001292 if (*exceptionObject == NULL)
1293 goto onError;
1294 }
1295 else {
1296 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1297 goto onError;
1298 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1299 goto onError;
1300 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1301 goto onError;
1302 }
1303
1304 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1305 if (restuple == NULL)
1306 goto onError;
1307 if (!PyTuple_Check(restuple)) {
1308 PyErr_Format(PyExc_TypeError, &argparse[4]);
1309 goto onError;
1310 }
1311 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1312 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001313
1314 /* Copy back the bytes variables, which might have been modified by the
1315 callback */
1316 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1317 if (!inputobj)
1318 goto onError;
1319 if (!PyBytes_Check(inputobj)) {
1320 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1321 }
1322 *input = PyBytes_AS_STRING(inputobj);
1323 insize = PyBytes_GET_SIZE(inputobj);
1324 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001325 /* we can DECREF safely, as the exception has another reference,
1326 so the object won't go away. */
1327 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001328
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001329 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001330 newpos = insize+newpos;
1331 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001332 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001333 goto onError;
1334 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001335
1336 /* need more space? (at least enough for what we
1337 have+the replacement+the rest of the string (starting
1338 at the new input position), so we won't have to check space
1339 when there are no errors in the rest of the string) */
1340 repptr = PyUnicode_AS_UNICODE(repunicode);
1341 repsize = PyUnicode_GET_SIZE(repunicode);
1342 requiredsize = *outpos + repsize + insize-newpos;
1343 if (requiredsize > outsize) {
1344 if (requiredsize<2*outsize)
1345 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001346 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001347 goto onError;
1348 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1349 }
1350 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001351 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001352 Py_UNICODE_COPY(*outptr, repptr, repsize);
1353 *outptr += repsize;
1354 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001355
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001356 /* we made it! */
1357 res = 0;
1358
1359 onError:
1360 Py_XDECREF(restuple);
1361 return res;
1362}
1363
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001364/* --- UTF-7 Codec -------------------------------------------------------- */
1365
1366/* see RFC2152 for details */
1367
static
char utf7_special[128] = {
    /* Classification of each 7-bit character for UTF-7, i.e. whether it
       can be encoded directly:
         0 - not special
         1 - special
         2 - whitespace (optional)
         3 - RFC2152 Set O (optional) */
    /* 0x00 - 0x0f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
1386
/* Note: the (c) <= 0 test in SPECIAL is a trick to work around gcc
   warnings about a comparison that is always false; utf7_special[0] is 1
   anyway, so making that one comparison true is safe. */

/* True when c cannot be written directly in UTF-7: outside 1..127, always
   special (table value 1), or optionally special -- whitespace (2) when
   encodeWS is set, RFC2152 Set O (3) when encodeO is set. */
#define SPECIAL(c, encodeO, encodeWS)                       \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 ||     \
     (encodeWS && (utf7_special[(c)] == 2)) ||              \
     (encodeO && (utf7_special[(c)] == 3)))

/* Base64 digit for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True when c belongs to the base64 alphabet. */
#define B64CHAR(c) \
    (isalnum(c) || (c) == '+' || (c) == '/')

/* Six-bit value of the base64 digit c (c must satisfy B64CHAR). */
#define UB64(c)                                     \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits through `out` for as long as at least six pending
   bits are available in `ch`; `bits` is decremented accordingly. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* Emit 16-bit code units through `out` for as long as at least sixteen
   pending bits are available in `ch`.  Relies on the `errmsg` variable and
   the `utf7Error` label of the function it is expanded in. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high */       \
            /* surrogate so let's not bother seeing if the low */       \
            /* surrogate is correct or not */                           \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't */      \
            /* represent it in a 16-bit character */                    \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001429
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001430PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001431 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001432 const char *errors)
1433{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001434 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001435 Py_ssize_t startinpos;
1436 Py_ssize_t endinpos;
1437 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001438 const char *e;
1439 PyUnicodeObject *unicode;
1440 Py_UNICODE *p;
1441 const char *errmsg = "";
1442 int inShift = 0;
1443 unsigned int bitsleft = 0;
1444 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001445 int surrogate = 0;
1446 PyObject *errorHandler = NULL;
1447 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001448
1449 unicode = _PyUnicode_New(size);
1450 if (!unicode)
1451 return NULL;
1452 if (size == 0)
1453 return (PyObject *)unicode;
1454
1455 p = unicode->str;
1456 e = s + size;
1457
1458 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001459 Py_UNICODE ch;
1460 restart:
1461 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001462
1463 if (inShift) {
1464 if ((ch == '-') || !B64CHAR(ch)) {
1465 inShift = 0;
1466 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001467
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001468 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1469 if (bitsleft >= 6) {
1470 /* The shift sequence has a partial character in it. If
1471 bitsleft < 6 then we could just classify it as padding
1472 but that is not the case here */
1473
1474 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001475 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001476 }
1477 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001478 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001479 here so indicate the potential of a misencoded character. */
1480
1481 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1482 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1483 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001484 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001485 }
1486
1487 if (ch == '-') {
1488 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001489 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001490 inShift = 1;
1491 }
1492 } else if (SPECIAL(ch,0,0)) {
1493 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001494 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001495 } else {
1496 *p++ = ch;
1497 }
1498 } else {
1499 charsleft = (charsleft << 6) | UB64(ch);
1500 bitsleft += 6;
1501 s++;
1502 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1503 }
1504 }
1505 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001506 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001507 s++;
1508 if (s < e && *s == '-') {
1509 s++;
1510 *p++ = '+';
1511 } else
1512 {
1513 inShift = 1;
1514 bitsleft = 0;
1515 }
1516 }
1517 else if (SPECIAL(ch,0,0)) {
1518 errmsg = "unexpected special character";
1519 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001520 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001521 }
1522 else {
1523 *p++ = ch;
1524 s++;
1525 }
1526 continue;
1527 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001528 outpos = p-PyUnicode_AS_UNICODE(unicode);
1529 endinpos = s-starts;
1530 if (unicode_decode_call_errorhandler(
1531 errors, &errorHandler,
1532 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001533 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001534 (PyObject **)&unicode, &outpos, &p))
1535 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001536 }
1537
1538 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001539 outpos = p-PyUnicode_AS_UNICODE(unicode);
1540 endinpos = size;
1541 if (unicode_decode_call_errorhandler(
1542 errors, &errorHandler,
1543 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001544 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001545 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001546 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001547 if (s < e)
1548 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001549 }
1550
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001551 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001552 goto onError;
1553
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001554 Py_XDECREF(errorHandler);
1555 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001556 return (PyObject *)unicode;
1557
1558onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001559 Py_XDECREF(errorHandler);
1560 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001561 Py_DECREF(unicode);
1562 return NULL;
1563}
1564
1565
1566PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001567 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001568 int encodeSetO,
1569 int encodeWhiteSpace,
1570 const char *errors)
1571{
1572 PyObject *v;
1573 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001574 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001575 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001576 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001577 unsigned int bitsleft = 0;
1578 unsigned long charsleft = 0;
1579 char * out;
1580 char * start;
1581
1582 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001583 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001584
Walter Dörwald51ab4142007-05-05 14:43:36 +00001585 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001586 if (v == NULL)
1587 return NULL;
1588
Walter Dörwald51ab4142007-05-05 14:43:36 +00001589 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001590 for (;i < size; ++i) {
1591 Py_UNICODE ch = s[i];
1592
1593 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001594 if (ch == '+') {
1595 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001596 *out++ = '-';
1597 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1598 charsleft = ch;
1599 bitsleft = 16;
1600 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001601 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001602 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001603 } else {
1604 *out++ = (char) ch;
1605 }
1606 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001607 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1608 *out++ = B64(charsleft << (6-bitsleft));
1609 charsleft = 0;
1610 bitsleft = 0;
1611 /* Characters not in the BASE64 set implicitly unshift the sequence
1612 so no '-' is required, except if the character is itself a '-' */
1613 if (B64CHAR(ch) || ch == '-') {
1614 *out++ = '-';
1615 }
1616 inShift = 0;
1617 *out++ = (char) ch;
1618 } else {
1619 bitsleft += 16;
1620 charsleft = (charsleft << 16) | ch;
1621 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1622
1623 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001624 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001625 or '-' then the shift sequence will be terminated implicitly and we
1626 don't have to insert a '-'. */
1627
1628 if (bitsleft == 0) {
1629 if (i + 1 < size) {
1630 Py_UNICODE ch2 = s[i+1];
1631
1632 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001633
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001634 } else if (B64CHAR(ch2) || ch2 == '-') {
1635 *out++ = '-';
1636 inShift = 0;
1637 } else {
1638 inShift = 0;
1639 }
1640
1641 }
1642 else {
1643 *out++ = '-';
1644 inShift = 0;
1645 }
1646 }
Tim Petersced69f82003-09-16 20:30:58 +00001647 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001648 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001649 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001650 if (bitsleft) {
1651 *out++= B64(charsleft << (6-bitsleft) );
1652 *out++ = '-';
1653 }
1654
Walter Dörwald51ab4142007-05-05 14:43:36 +00001655 if (PyBytes_Resize(v, out - start)) {
1656 Py_DECREF(v);
1657 return NULL;
1658 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001659 return v;
1660}
1661
1662#undef SPECIAL
1663#undef B64
1664#undef B64CHAR
1665#undef UB64
1666#undef ENCODE
1667#undef DECODE
1668
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669/* --- UTF-8 Codec -------------------------------------------------------- */
1670
/* Lookup table indexed by the first byte of a UTF-8 sequence.  Each entry
   is the total length in bytes of the sequence that byte introduces; a
   zero entry marks a byte that is not a legal prefix.  See RFC 2279 for
   details. */
static char utf8_code_length[256] = {
    /* 0x00-0x0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10-0x1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20-0x2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30-0x3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40-0x4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50-0x5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60-0x6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70-0x7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0x8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90-0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0-0xAF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0-0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xCF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0-0xDF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFF */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1692
Guido van Rossumd57fd912000-03-10 22:53:23 +00001693PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001694 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001695 const char *errors)
1696{
Walter Dörwald69652032004-09-07 20:24:22 +00001697 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1698}
1699
1700PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001701 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001702 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001703 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001704{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001705 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001706 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001707 Py_ssize_t startinpos;
1708 Py_ssize_t endinpos;
1709 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710 const char *e;
1711 PyUnicodeObject *unicode;
1712 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001713 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001714 PyObject *errorHandler = NULL;
1715 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001716
1717 /* Note: size will always be longer than the resulting Unicode
1718 character count */
1719 unicode = _PyUnicode_New(size);
1720 if (!unicode)
1721 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001722 if (size == 0) {
1723 if (consumed)
1724 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001725 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001726 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001727
1728 /* Unpack UTF-8 encoded data */
1729 p = unicode->str;
1730 e = s + size;
1731
1732 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001733 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734
1735 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001736 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001737 s++;
1738 continue;
1739 }
1740
1741 n = utf8_code_length[ch];
1742
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001743 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001744 if (consumed)
1745 break;
1746 else {
1747 errmsg = "unexpected end of data";
1748 startinpos = s-starts;
1749 endinpos = size;
1750 goto utf8Error;
1751 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001752 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753
1754 switch (n) {
1755
1756 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001757 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001758 startinpos = s-starts;
1759 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001760 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001761
1762 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001763 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001764 startinpos = s-starts;
1765 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001766 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001767
1768 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001769 if ((s[1] & 0xc0) != 0x80) {
1770 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001771 startinpos = s-starts;
1772 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001773 goto utf8Error;
1774 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001775 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001776 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001777 startinpos = s-starts;
1778 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001779 errmsg = "illegal encoding";
1780 goto utf8Error;
1781 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001783 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784 break;
1785
1786 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001787 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001788 (s[2] & 0xc0) != 0x80) {
1789 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001790 startinpos = s-starts;
1791 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001792 goto utf8Error;
1793 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001794 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001795 if (ch < 0x0800) {
1796 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001797 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001798
1799 XXX For wide builds (UCS-4) we should probably try
1800 to recombine the surrogates into a single code
1801 unit.
1802 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001803 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001804 startinpos = s-starts;
1805 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001806 goto utf8Error;
1807 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001808 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001809 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001810 break;
1811
1812 case 4:
1813 if ((s[1] & 0xc0) != 0x80 ||
1814 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001815 (s[3] & 0xc0) != 0x80) {
1816 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001817 startinpos = s-starts;
1818 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001819 goto utf8Error;
1820 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001821 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1822 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1823 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001824 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001825 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001826 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001827 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001828 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001829 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001830 startinpos = s-starts;
1831 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001832 goto utf8Error;
1833 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001834#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001835 *p++ = (Py_UNICODE)ch;
1836#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001837 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001838
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001839 /* translate from 10000..10FFFF to 0..FFFF */
1840 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001841
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001842 /* high surrogate = top 10 bits added to D800 */
1843 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001844
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001845 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001846 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001847#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001848 break;
1849
1850 default:
1851 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001852 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001853 startinpos = s-starts;
1854 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001855 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856 }
1857 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001858 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001859
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001860 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001861 outpos = p-PyUnicode_AS_UNICODE(unicode);
1862 if (unicode_decode_call_errorhandler(
1863 errors, &errorHandler,
1864 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001865 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001866 (PyObject **)&unicode, &outpos, &p))
1867 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001868 }
Walter Dörwald69652032004-09-07 20:24:22 +00001869 if (consumed)
1870 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001871
1872 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001873 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001874 goto onError;
1875
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001876 Py_XDECREF(errorHandler);
1877 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001878 return (PyObject *)unicode;
1879
1880onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001881 Py_XDECREF(errorHandler);
1882 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001883 Py_DECREF(unicode);
1884 return NULL;
1885}
1886
Tim Peters602f7402002-04-27 18:03:26 +00001887/* Allocation strategy: if the string is short, convert into a stack buffer
1888 and allocate exactly as much space needed at the end. Else allocate the
1889 maximum possible needed (4 result bytes per Unicode character), and return
1890 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001891*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001892PyObject *
1893PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001894 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001895 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001896{
Tim Peters602f7402002-04-27 18:03:26 +00001897#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001898
Martin v. Löwis18e16552006-02-15 17:27:45 +00001899 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001900 PyObject *v; /* result string object */
1901 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001902 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001903 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001904 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001905
Tim Peters602f7402002-04-27 18:03:26 +00001906 assert(s != NULL);
1907 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001908
Tim Peters602f7402002-04-27 18:03:26 +00001909 if (size <= MAX_SHORT_UNICHARS) {
1910 /* Write into the stack buffer; nallocated can't overflow.
1911 * At the end, we'll allocate exactly as much heap space as it
1912 * turns out we need.
1913 */
1914 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1915 v = NULL; /* will allocate after we're done */
1916 p = stackbuf;
1917 }
1918 else {
1919 /* Overallocate on the heap, and give the excess back at the end. */
1920 nallocated = size * 4;
1921 if (nallocated / 4 != size) /* overflow! */
1922 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001923 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001924 if (v == NULL)
1925 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001926 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001927 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001928
Tim Peters602f7402002-04-27 18:03:26 +00001929 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001930 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001931
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001932 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001933 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001934 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001935
Guido van Rossumd57fd912000-03-10 22:53:23 +00001936 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001937 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001938 *p++ = (char)(0xc0 | (ch >> 6));
1939 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001940 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001941 else {
Tim Peters602f7402002-04-27 18:03:26 +00001942 /* Encode UCS2 Unicode ordinals */
1943 if (ch < 0x10000) {
1944 /* Special case: check for high surrogate */
1945 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1946 Py_UCS4 ch2 = s[i];
1947 /* Check for low surrogate and combine the two to
1948 form a UCS4 value */
1949 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001950 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001951 i++;
1952 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001953 }
Tim Peters602f7402002-04-27 18:03:26 +00001954 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001955 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001956 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001957 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1958 *p++ = (char)(0x80 | (ch & 0x3f));
1959 continue;
1960 }
1961encodeUCS4:
1962 /* Encode UCS4 Unicode ordinals */
1963 *p++ = (char)(0xf0 | (ch >> 18));
1964 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1965 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1966 *p++ = (char)(0x80 | (ch & 0x3f));
1967 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001968 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001969
Tim Peters602f7402002-04-27 18:03:26 +00001970 if (v == NULL) {
1971 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001972 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001973 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001974 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001975 }
1976 else {
1977 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001978 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001979 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001980 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001981 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001982 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001983
Tim Peters602f7402002-04-27 18:03:26 +00001984#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001985}
1986
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1988{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989 if (!PyUnicode_Check(unicode)) {
1990 PyErr_BadArgument();
1991 return NULL;
1992 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001993 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1994 PyUnicode_GET_SIZE(unicode),
1995 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001996}
1997
Walter Dörwald41980ca2007-08-16 21:55:45 +00001998/* --- UTF-32 Codec ------------------------------------------------------- */
1999
2000PyObject *
2001PyUnicode_DecodeUTF32(const char *s,
2002 Py_ssize_t size,
2003 const char *errors,
2004 int *byteorder)
2005{
2006 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2007}
2008
2009PyObject *
2010PyUnicode_DecodeUTF32Stateful(const char *s,
2011 Py_ssize_t size,
2012 const char *errors,
2013 int *byteorder,
2014 Py_ssize_t *consumed)
2015{
2016 const char *starts = s;
2017 Py_ssize_t startinpos;
2018 Py_ssize_t endinpos;
2019 Py_ssize_t outpos;
2020 PyUnicodeObject *unicode;
2021 Py_UNICODE *p;
2022#ifndef Py_UNICODE_WIDE
2023 int i, pairs;
2024#else
2025 const int pairs = 0;
2026#endif
2027 const unsigned char *q, *e;
2028 int bo = 0; /* assume native ordering by default */
2029 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002030 /* Offsets from q for retrieving bytes in the right order. */
2031#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2032 int iorder[] = {0, 1, 2, 3};
2033#else
2034 int iorder[] = {3, 2, 1, 0};
2035#endif
2036 PyObject *errorHandler = NULL;
2037 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002038 /* On narrow builds we split characters outside the BMP into two
2039 codepoints => count how much extra space we need. */
2040#ifndef Py_UNICODE_WIDE
2041 for (i = pairs = 0; i < size/4; i++)
2042 if (((Py_UCS4 *)s)[i] >= 0x10000)
2043 pairs++;
2044#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002045
2046 /* This might be one to much, because of a BOM */
2047 unicode = _PyUnicode_New((size+3)/4+pairs);
2048 if (!unicode)
2049 return NULL;
2050 if (size == 0)
2051 return (PyObject *)unicode;
2052
2053 /* Unpack UTF-32 encoded data */
2054 p = unicode->str;
2055 q = (unsigned char *)s;
2056 e = q + size;
2057
2058 if (byteorder)
2059 bo = *byteorder;
2060
2061 /* Check for BOM marks (U+FEFF) in the input and adjust current
2062 byte order setting accordingly. In native mode, the leading BOM
2063 mark is skipped, in all other modes, it is copied to the output
2064 stream as-is (giving a ZWNBSP character). */
2065 if (bo == 0) {
2066 if (size >= 4) {
2067 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2068 (q[iorder[1]] << 8) | q[iorder[0]];
2069#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2070 if (bom == 0x0000FEFF) {
2071 q += 4;
2072 bo = -1;
2073 }
2074 else if (bom == 0xFFFE0000) {
2075 q += 4;
2076 bo = 1;
2077 }
2078#else
2079 if (bom == 0x0000FEFF) {
2080 q += 4;
2081 bo = 1;
2082 }
2083 else if (bom == 0xFFFE0000) {
2084 q += 4;
2085 bo = -1;
2086 }
2087#endif
2088 }
2089 }
2090
2091 if (bo == -1) {
2092 /* force LE */
2093 iorder[0] = 0;
2094 iorder[1] = 1;
2095 iorder[2] = 2;
2096 iorder[3] = 3;
2097 }
2098 else if (bo == 1) {
2099 /* force BE */
2100 iorder[0] = 3;
2101 iorder[1] = 2;
2102 iorder[2] = 1;
2103 iorder[3] = 0;
2104 }
2105
2106 while (q < e) {
2107 Py_UCS4 ch;
2108 /* remaining bytes at the end? (size should be divisible by 4) */
2109 if (e-q<4) {
2110 if (consumed)
2111 break;
2112 errmsg = "truncated data";
2113 startinpos = ((const char *)q)-starts;
2114 endinpos = ((const char *)e)-starts;
2115 goto utf32Error;
2116 /* The remaining input chars are ignored if the callback
2117 chooses to skip the input */
2118 }
2119 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2120 (q[iorder[1]] << 8) | q[iorder[0]];
2121
2122 if (ch >= 0x110000)
2123 {
2124 errmsg = "codepoint not in range(0x110000)";
2125 startinpos = ((const char *)q)-starts;
2126 endinpos = startinpos+4;
2127 goto utf32Error;
2128 }
2129#ifndef Py_UNICODE_WIDE
2130 if (ch >= 0x10000)
2131 {
2132 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2133 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2134 }
2135 else
2136#endif
2137 *p++ = ch;
2138 q += 4;
2139 continue;
2140 utf32Error:
2141 outpos = p-PyUnicode_AS_UNICODE(unicode);
2142 if (unicode_decode_call_errorhandler(
2143 errors, &errorHandler,
2144 "utf32", errmsg,
2145 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2146 (PyObject **)&unicode, &outpos, &p))
2147 goto onError;
2148 }
2149
2150 if (byteorder)
2151 *byteorder = bo;
2152
2153 if (consumed)
2154 *consumed = (const char *)q-starts;
2155
2156 /* Adjust length */
2157 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2158 goto onError;
2159
2160 Py_XDECREF(errorHandler);
2161 Py_XDECREF(exc);
2162 return (PyObject *)unicode;
2163
2164onError:
2165 Py_DECREF(unicode);
2166 Py_XDECREF(errorHandler);
2167 Py_XDECREF(exc);
2168 return NULL;
2169}
2170
2171PyObject *
2172PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2173 Py_ssize_t size,
2174 const char *errors,
2175 int byteorder)
2176{
2177 PyObject *v;
2178 unsigned char *p;
2179#ifndef Py_UNICODE_WIDE
2180 int i, pairs;
2181#else
2182 const int pairs = 0;
2183#endif
2184 /* Offsets from p for storing byte pairs in the right order. */
2185#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2186 int iorder[] = {0, 1, 2, 3};
2187#else
2188 int iorder[] = {3, 2, 1, 0};
2189#endif
2190
2191#define STORECHAR(CH) \
2192 do { \
2193 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2194 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2195 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2196 p[iorder[0]] = (CH) & 0xff; \
2197 p += 4; \
2198 } while(0)
2199
2200 /* In narrow builds we can output surrogate pairs as one codepoint,
2201 so we need less space. */
2202#ifndef Py_UNICODE_WIDE
2203 for (i = pairs = 0; i < size-1; i++)
2204 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2205 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2206 pairs++;
2207#endif
2208 v = PyBytes_FromStringAndSize(NULL,
2209 4 * (size - pairs + (byteorder == 0)));
2210 if (v == NULL)
2211 return NULL;
2212
2213 p = (unsigned char *)PyBytes_AS_STRING(v);
2214 if (byteorder == 0)
2215 STORECHAR(0xFEFF);
2216 if (size == 0)
2217 return v;
2218
2219 if (byteorder == -1) {
2220 /* force LE */
2221 iorder[0] = 0;
2222 iorder[1] = 1;
2223 iorder[2] = 2;
2224 iorder[3] = 3;
2225 }
2226 else if (byteorder == 1) {
2227 /* force BE */
2228 iorder[0] = 3;
2229 iorder[1] = 2;
2230 iorder[2] = 1;
2231 iorder[3] = 0;
2232 }
2233
2234 while (size-- > 0) {
2235 Py_UCS4 ch = *s++;
2236#ifndef Py_UNICODE_WIDE
2237 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2238 Py_UCS4 ch2 = *s;
2239 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2240 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2241 s++;
2242 size--;
2243 }
2244 }
2245#endif
2246 STORECHAR(ch);
2247 }
2248 return v;
2249#undef STORECHAR
2250}
2251
2252PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2253{
2254 if (!PyUnicode_Check(unicode)) {
2255 PyErr_BadArgument();
2256 return NULL;
2257 }
2258 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2259 PyUnicode_GET_SIZE(unicode),
2260 NULL,
2261 0);
2262}
2263
Guido van Rossumd57fd912000-03-10 22:53:23 +00002264/* --- UTF-16 Codec ------------------------------------------------------- */
2265
Tim Peters772747b2001-08-09 22:21:55 +00002266PyObject *
2267PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002268 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002269 const char *errors,
2270 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002271{
Walter Dörwald69652032004-09-07 20:24:22 +00002272 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2273}
2274
2275PyObject *
2276PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002277 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002278 const char *errors,
2279 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002280 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002281{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002282 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002283 Py_ssize_t startinpos;
2284 Py_ssize_t endinpos;
2285 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002286 PyUnicodeObject *unicode;
2287 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002288 const unsigned char *q, *e;
2289 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002290 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002291 /* Offsets from q for retrieving byte pairs in the right order. */
2292#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2293 int ihi = 1, ilo = 0;
2294#else
2295 int ihi = 0, ilo = 1;
2296#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002297 PyObject *errorHandler = NULL;
2298 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299
2300 /* Note: size will always be longer than the resulting Unicode
2301 character count */
2302 unicode = _PyUnicode_New(size);
2303 if (!unicode)
2304 return NULL;
2305 if (size == 0)
2306 return (PyObject *)unicode;
2307
2308 /* Unpack UTF-16 encoded data */
2309 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002310 q = (unsigned char *)s;
2311 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002312
2313 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002314 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002315
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002316 /* Check for BOM marks (U+FEFF) in the input and adjust current
2317 byte order setting accordingly. In native mode, the leading BOM
2318 mark is skipped, in all other modes, it is copied to the output
2319 stream as-is (giving a ZWNBSP character). */
2320 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002321 if (size >= 2) {
2322 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002323#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002324 if (bom == 0xFEFF) {
2325 q += 2;
2326 bo = -1;
2327 }
2328 else if (bom == 0xFFFE) {
2329 q += 2;
2330 bo = 1;
2331 }
Tim Petersced69f82003-09-16 20:30:58 +00002332#else
Walter Dörwald69652032004-09-07 20:24:22 +00002333 if (bom == 0xFEFF) {
2334 q += 2;
2335 bo = 1;
2336 }
2337 else if (bom == 0xFFFE) {
2338 q += 2;
2339 bo = -1;
2340 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002341#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002342 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002343 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002344
Tim Peters772747b2001-08-09 22:21:55 +00002345 if (bo == -1) {
2346 /* force LE */
2347 ihi = 1;
2348 ilo = 0;
2349 }
2350 else if (bo == 1) {
2351 /* force BE */
2352 ihi = 0;
2353 ilo = 1;
2354 }
2355
2356 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002357 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002358 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002359 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002360 if (consumed)
2361 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002362 errmsg = "truncated data";
2363 startinpos = ((const char *)q)-starts;
2364 endinpos = ((const char *)e)-starts;
2365 goto utf16Error;
2366 /* The remaining input chars are ignored if the callback
2367 chooses to skip the input */
2368 }
2369 ch = (q[ihi] << 8) | q[ilo];
2370
Tim Peters772747b2001-08-09 22:21:55 +00002371 q += 2;
2372
Guido van Rossumd57fd912000-03-10 22:53:23 +00002373 if (ch < 0xD800 || ch > 0xDFFF) {
2374 *p++ = ch;
2375 continue;
2376 }
2377
2378 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002379 if (q >= e) {
2380 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002381 startinpos = (((const char *)q)-2)-starts;
2382 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002383 goto utf16Error;
2384 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002385 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002386 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2387 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002388 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002389#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002390 *p++ = ch;
2391 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002392#else
2393 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002394#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002395 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002396 }
2397 else {
2398 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002399 startinpos = (((const char *)q)-4)-starts;
2400 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002401 goto utf16Error;
2402 }
2403
Guido van Rossumd57fd912000-03-10 22:53:23 +00002404 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002405 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002406 startinpos = (((const char *)q)-2)-starts;
2407 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002408 /* Fall through to report the error */
2409
2410 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002411 outpos = p-PyUnicode_AS_UNICODE(unicode);
2412 if (unicode_decode_call_errorhandler(
2413 errors, &errorHandler,
2414 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002415 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002416 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002417 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002418 }
2419
2420 if (byteorder)
2421 *byteorder = bo;
2422
Walter Dörwald69652032004-09-07 20:24:22 +00002423 if (consumed)
2424 *consumed = (const char *)q-starts;
2425
Guido van Rossumd57fd912000-03-10 22:53:23 +00002426 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002427 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002428 goto onError;
2429
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002430 Py_XDECREF(errorHandler);
2431 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002432 return (PyObject *)unicode;
2433
2434onError:
2435 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002436 Py_XDECREF(errorHandler);
2437 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002438 return NULL;
2439}
2440
Tim Peters772747b2001-08-09 22:21:55 +00002441PyObject *
2442PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002443 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002444 const char *errors,
2445 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002446{
2447 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002448 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002449#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002450 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002451#else
2452 const int pairs = 0;
2453#endif
Tim Peters772747b2001-08-09 22:21:55 +00002454 /* Offsets from p for storing byte pairs in the right order. */
2455#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2456 int ihi = 1, ilo = 0;
2457#else
2458 int ihi = 0, ilo = 1;
2459#endif
2460
2461#define STORECHAR(CH) \
2462 do { \
2463 p[ihi] = ((CH) >> 8) & 0xff; \
2464 p[ilo] = (CH) & 0xff; \
2465 p += 2; \
2466 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002467
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002468#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002469 for (i = pairs = 0; i < size; i++)
2470 if (s[i] >= 0x10000)
2471 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002472#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002473 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002474 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002475 if (v == NULL)
2476 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002477
Walter Dörwald3cc34522007-05-04 10:48:27 +00002478 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002479 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002480 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002481 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002482 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002483
2484 if (byteorder == -1) {
2485 /* force LE */
2486 ihi = 1;
2487 ilo = 0;
2488 }
2489 else if (byteorder == 1) {
2490 /* force BE */
2491 ihi = 0;
2492 ilo = 1;
2493 }
2494
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002495 while (size-- > 0) {
2496 Py_UNICODE ch = *s++;
2497 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002498#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002499 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002500 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2501 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002502 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002503#endif
Tim Peters772747b2001-08-09 22:21:55 +00002504 STORECHAR(ch);
2505 if (ch2)
2506 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002507 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002508 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002509#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002510}
2511
2512PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2513{
2514 if (!PyUnicode_Check(unicode)) {
2515 PyErr_BadArgument();
2516 return NULL;
2517 }
2518 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2519 PyUnicode_GET_SIZE(unicode),
2520 NULL,
2521 0);
2522}
2523
2524/* --- Unicode Escape Codec ----------------------------------------------- */
2525
Fredrik Lundh06d12682001-01-24 07:59:11 +00002526static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002527
Guido van Rossumd57fd912000-03-10 22:53:23 +00002528PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002529 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002530 const char *errors)
2531{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002532 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002533 Py_ssize_t startinpos;
2534 Py_ssize_t endinpos;
2535 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002536 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002538 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002539 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002540 char* message;
2541 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002542 PyObject *errorHandler = NULL;
2543 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002544
Guido van Rossumd57fd912000-03-10 22:53:23 +00002545 /* Escaped strings will always be longer than the resulting
2546 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002547 length after conversion to the true value.
2548 (but if the error callback returns a long replacement string
2549 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002550 v = _PyUnicode_New(size);
2551 if (v == NULL)
2552 goto onError;
2553 if (size == 0)
2554 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002555
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002556 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002557 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002558
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559 while (s < end) {
2560 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002561 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002562 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002563
2564 /* Non-escape characters are interpreted as Unicode ordinals */
2565 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002566 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002567 continue;
2568 }
2569
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002570 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002571 /* \ - Escapes */
2572 s++;
2573 switch (*s++) {
2574
2575 /* \x escapes */
2576 case '\n': break;
2577 case '\\': *p++ = '\\'; break;
2578 case '\'': *p++ = '\''; break;
2579 case '\"': *p++ = '\"'; break;
2580 case 'b': *p++ = '\b'; break;
2581 case 'f': *p++ = '\014'; break; /* FF */
2582 case 't': *p++ = '\t'; break;
2583 case 'n': *p++ = '\n'; break;
2584 case 'r': *p++ = '\r'; break;
2585 case 'v': *p++ = '\013'; break; /* VT */
2586 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2587
2588 /* \OOO (octal) escapes */
2589 case '0': case '1': case '2': case '3':
2590 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002591 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002592 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002593 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002594 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002595 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002596 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002597 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002598 break;
2599
Fredrik Lundhccc74732001-02-18 22:13:49 +00002600 /* hex escapes */
2601 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002602 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002603 digits = 2;
2604 message = "truncated \\xXX escape";
2605 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002606
Fredrik Lundhccc74732001-02-18 22:13:49 +00002607 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002608 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002609 digits = 4;
2610 message = "truncated \\uXXXX escape";
2611 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002612
Fredrik Lundhccc74732001-02-18 22:13:49 +00002613 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002614 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002615 digits = 8;
2616 message = "truncated \\UXXXXXXXX escape";
2617 hexescape:
2618 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002619 outpos = p-PyUnicode_AS_UNICODE(v);
2620 if (s+digits>end) {
2621 endinpos = size;
2622 if (unicode_decode_call_errorhandler(
2623 errors, &errorHandler,
2624 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002625 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002626 (PyObject **)&v, &outpos, &p))
2627 goto onError;
2628 goto nextByte;
2629 }
2630 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002631 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002632 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002633 endinpos = (s+i+1)-starts;
2634 if (unicode_decode_call_errorhandler(
2635 errors, &errorHandler,
2636 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002637 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002638 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002639 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002640 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002641 }
2642 chr = (chr<<4) & ~0xF;
2643 if (c >= '0' && c <= '9')
2644 chr += c - '0';
2645 else if (c >= 'a' && c <= 'f')
2646 chr += 10 + c - 'a';
2647 else
2648 chr += 10 + c - 'A';
2649 }
2650 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002651 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002652 /* _decoding_error will have already written into the
2653 target buffer. */
2654 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002655 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002656 /* when we get here, chr is a 32-bit unicode character */
2657 if (chr <= 0xffff)
2658 /* UCS-2 character */
2659 *p++ = (Py_UNICODE) chr;
2660 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002661 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002662 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002663#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002664 *p++ = chr;
2665#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002666 chr -= 0x10000L;
2667 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002668 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002669#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002670 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002671 endinpos = s-starts;
2672 outpos = p-PyUnicode_AS_UNICODE(v);
2673 if (unicode_decode_call_errorhandler(
2674 errors, &errorHandler,
2675 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002676 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002677 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002678 goto onError;
2679 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002680 break;
2681
2682 /* \N{name} */
2683 case 'N':
2684 message = "malformed \\N character escape";
2685 if (ucnhash_CAPI == NULL) {
2686 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002687 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002688 m = PyImport_ImportModule("unicodedata");
2689 if (m == NULL)
2690 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002691 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002692 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002693 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002694 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002695 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002696 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002697 if (ucnhash_CAPI == NULL)
2698 goto ucnhashError;
2699 }
2700 if (*s == '{') {
2701 const char *start = s+1;
2702 /* look for the closing brace */
2703 while (*s != '}' && s < end)
2704 s++;
2705 if (s > start && s < end && *s == '}') {
2706 /* found a name. look it up in the unicode database */
2707 message = "unknown Unicode character name";
2708 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002709 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002710 goto store;
2711 }
2712 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002713 endinpos = s-starts;
2714 outpos = p-PyUnicode_AS_UNICODE(v);
2715 if (unicode_decode_call_errorhandler(
2716 errors, &errorHandler,
2717 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002718 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002719 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002720 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002721 break;
2722
2723 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002724 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002725 message = "\\ at end of string";
2726 s--;
2727 endinpos = s-starts;
2728 outpos = p-PyUnicode_AS_UNICODE(v);
2729 if (unicode_decode_call_errorhandler(
2730 errors, &errorHandler,
2731 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002732 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002733 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002734 goto onError;
2735 }
2736 else {
2737 *p++ = '\\';
2738 *p++ = (unsigned char)s[-1];
2739 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002740 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002742 nextByte:
2743 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002744 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002745 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002746 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002747 Py_XDECREF(errorHandler);
2748 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002749 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002750
Fredrik Lundhccc74732001-02-18 22:13:49 +00002751ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002752 PyErr_SetString(
2753 PyExc_UnicodeError,
2754 "\\N escapes not supported (can't load unicodedata module)"
2755 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002756 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002757 Py_XDECREF(errorHandler);
2758 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002759 return NULL;
2760
Fredrik Lundhccc74732001-02-18 22:13:49 +00002761onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002763 Py_XDECREF(errorHandler);
2764 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002765 return NULL;
2766}
2767
2768/* Return a Unicode-Escape string version of the Unicode object.
2769
2770 If quotes is true, the string is enclosed in u"" or u'' quotes as
2771 appropriate.
2772
2773*/
2774
Thomas Wouters477c8d52006-05-27 19:21:47 +00002775Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2776 Py_ssize_t size,
2777 Py_UNICODE ch)
2778{
2779 /* like wcschr, but doesn't stop at NULL characters */
2780
2781 while (size-- > 0) {
2782 if (*s == ch)
2783 return s;
2784 s++;
2785 }
2786
2787 return NULL;
2788}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002789
Walter Dörwald79e913e2007-05-12 11:08:06 +00002790static const char *hexdigits = "0123456789abcdef";
2791
2792PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2793 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002794{
2795 PyObject *repr;
2796 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002797
Thomas Wouters89f507f2006-12-13 04:49:30 +00002798 /* XXX(nnorwitz): rather than over-allocating, it would be
2799 better to choose a different scheme. Perhaps scan the
2800 first N-chars of the string and allocate based on that size.
2801 */
2802 /* Initial allocation is based on the longest-possible unichr
2803 escape.
2804
2805 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2806 unichr, so in this case it's the longest unichr escape. In
2807 narrow (UTF-16) builds this is five chars per source unichr
2808 since there are two unichrs in the surrogate pair, so in narrow
2809 (UTF-16) builds it's not the longest unichr escape.
2810
2811 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2812 so in the narrow (UTF-16) build case it's the longest unichr
2813 escape.
2814 */
2815
Walter Dörwald79e913e2007-05-12 11:08:06 +00002816 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002817#ifdef Py_UNICODE_WIDE
2818 + 10*size
2819#else
2820 + 6*size
2821#endif
2822 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002823 if (repr == NULL)
2824 return NULL;
2825
Walter Dörwald79e913e2007-05-12 11:08:06 +00002826 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002827
Guido van Rossumd57fd912000-03-10 22:53:23 +00002828 while (size-- > 0) {
2829 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002830
Walter Dörwald79e913e2007-05-12 11:08:06 +00002831 /* Escape backslashes */
2832 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002833 *p++ = '\\';
2834 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002835 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002836 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002837
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002838#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002839 /* Map 21-bit characters to '\U00xxxxxx' */
2840 else if (ch >= 0x10000) {
2841 *p++ = '\\';
2842 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002843 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2844 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2845 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2846 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2847 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2848 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2849 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2850 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002851 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002852 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002853#else
2854 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002855 else if (ch >= 0xD800 && ch < 0xDC00) {
2856 Py_UNICODE ch2;
2857 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002858
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002859 ch2 = *s++;
2860 size--;
2861 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2862 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2863 *p++ = '\\';
2864 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002865 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2866 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2867 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2868 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2869 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2870 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2871 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2872 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002873 continue;
2874 }
2875 /* Fall through: isolated surrogates are copied as-is */
2876 s--;
2877 size++;
2878 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002879#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002880
Guido van Rossumd57fd912000-03-10 22:53:23 +00002881 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002882 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002883 *p++ = '\\';
2884 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002885 *p++ = hexdigits[(ch >> 12) & 0x000F];
2886 *p++ = hexdigits[(ch >> 8) & 0x000F];
2887 *p++ = hexdigits[(ch >> 4) & 0x000F];
2888 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002889 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002890
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002891 /* Map special whitespace to '\t', \n', '\r' */
2892 else if (ch == '\t') {
2893 *p++ = '\\';
2894 *p++ = 't';
2895 }
2896 else if (ch == '\n') {
2897 *p++ = '\\';
2898 *p++ = 'n';
2899 }
2900 else if (ch == '\r') {
2901 *p++ = '\\';
2902 *p++ = 'r';
2903 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002904
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002905 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002906 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002907 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002908 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002909 *p++ = hexdigits[(ch >> 4) & 0x000F];
2910 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002911 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002912
Guido van Rossumd57fd912000-03-10 22:53:23 +00002913 /* Copy everything else as-is */
2914 else
2915 *p++ = (char) ch;
2916 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002917
2918 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002919 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2920 Py_DECREF(repr);
2921 return NULL;
2922 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002923 return repr;
2924}
2925
Guido van Rossumd57fd912000-03-10 22:53:23 +00002926PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2927{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002928 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002929 if (!PyUnicode_Check(unicode)) {
2930 PyErr_BadArgument();
2931 return NULL;
2932 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002933 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2934 PyUnicode_GET_SIZE(unicode));
2935
2936 if (!s)
2937 return NULL;
2938 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2939 PyBytes_GET_SIZE(s));
2940 Py_DECREF(s);
2941 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002942}
2943
2944/* --- Raw Unicode Escape Codec ------------------------------------------- */
2945
2946PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002947 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002948 const char *errors)
2949{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002950 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002951 Py_ssize_t startinpos;
2952 Py_ssize_t endinpos;
2953 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002954 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002955 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002956 const char *end;
2957 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002958 PyObject *errorHandler = NULL;
2959 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002960
Guido van Rossumd57fd912000-03-10 22:53:23 +00002961 /* Escaped strings will always be longer than the resulting
2962 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002963 length after conversion to the true value. (But decoding error
2964 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002965 v = _PyUnicode_New(size);
2966 if (v == NULL)
2967 goto onError;
2968 if (size == 0)
2969 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002970 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002971 end = s + size;
2972 while (s < end) {
2973 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002974 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002975 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002976 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002977
2978 /* Non-escape characters are interpreted as Unicode ordinals */
2979 if (*s != '\\') {
2980 *p++ = (unsigned char)*s++;
2981 continue;
2982 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002983 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002984
2985 /* \u-escapes are only interpreted iff the number of leading
2986 backslashes if odd */
2987 bs = s;
2988 for (;s < end;) {
2989 if (*s != '\\')
2990 break;
2991 *p++ = (unsigned char)*s++;
2992 }
2993 if (((s - bs) & 1) == 0 ||
2994 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002995 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002996 continue;
2997 }
2998 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002999 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003000 s++;
3001
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003002 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003003 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003004 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003005 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003006 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003007 endinpos = s-starts;
3008 if (unicode_decode_call_errorhandler(
3009 errors, &errorHandler,
3010 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003011 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003012 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003013 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003014 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003015 }
3016 x = (x<<4) & ~0xF;
3017 if (c >= '0' && c <= '9')
3018 x += c - '0';
3019 else if (c >= 'a' && c <= 'f')
3020 x += 10 + c - 'a';
3021 else
3022 x += 10 + c - 'A';
3023 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003024#ifndef Py_UNICODE_WIDE
3025 if (x > 0x10000) {
3026 if (unicode_decode_call_errorhandler(
3027 errors, &errorHandler,
3028 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003029 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003030 (PyObject **)&v, &outpos, &p))
3031 goto onError;
3032 }
3033#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003034 *p++ = x;
3035 nextByte:
3036 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003038 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003039 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003040 Py_XDECREF(errorHandler);
3041 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003042 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003043
Guido van Rossumd57fd912000-03-10 22:53:23 +00003044 onError:
3045 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003046 Py_XDECREF(errorHandler);
3047 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003048 return NULL;
3049}
3050
3051PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003052 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053{
3054 PyObject *repr;
3055 char *p;
3056 char *q;
3057
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003058#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00003059 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003060#else
Walter Dörwald711005d2007-05-12 12:03:26 +00003061 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003062#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003063 if (repr == NULL)
3064 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003065 if (size == 0)
3066 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067
Walter Dörwald711005d2007-05-12 12:03:26 +00003068 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003069 while (size-- > 0) {
3070 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003071#ifdef Py_UNICODE_WIDE
3072 /* Map 32-bit characters to '\Uxxxxxxxx' */
3073 if (ch >= 0x10000) {
3074 *p++ = '\\';
3075 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003076 *p++ = hexdigits[(ch >> 28) & 0xf];
3077 *p++ = hexdigits[(ch >> 24) & 0xf];
3078 *p++ = hexdigits[(ch >> 20) & 0xf];
3079 *p++ = hexdigits[(ch >> 16) & 0xf];
3080 *p++ = hexdigits[(ch >> 12) & 0xf];
3081 *p++ = hexdigits[(ch >> 8) & 0xf];
3082 *p++ = hexdigits[(ch >> 4) & 0xf];
3083 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003084 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003085 else
3086#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087 /* Map 16-bit characters to '\uxxxx' */
3088 if (ch >= 256) {
3089 *p++ = '\\';
3090 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003091 *p++ = hexdigits[(ch >> 12) & 0xf];
3092 *p++ = hexdigits[(ch >> 8) & 0xf];
3093 *p++ = hexdigits[(ch >> 4) & 0xf];
3094 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003095 }
3096 /* Copy everything else as-is */
3097 else
3098 *p++ = (char) ch;
3099 }
3100 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00003101 if (PyBytes_Resize(repr, p - q)) {
3102 Py_DECREF(repr);
3103 return NULL;
3104 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003105 return repr;
3106}
3107
3108PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3109{
Walter Dörwald711005d2007-05-12 12:03:26 +00003110 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003111 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003112 PyErr_BadArgument();
3113 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003114 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003115 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3116 PyUnicode_GET_SIZE(unicode));
3117
3118 if (!s)
3119 return NULL;
3120 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3121 PyBytes_GET_SIZE(s));
3122 Py_DECREF(s);
3123 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003124}
3125
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003126/* --- Unicode Internal Codec ------------------------------------------- */
3127
3128PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003129 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003130 const char *errors)
3131{
3132 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003133 Py_ssize_t startinpos;
3134 Py_ssize_t endinpos;
3135 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003136 PyUnicodeObject *v;
3137 Py_UNICODE *p;
3138 const char *end;
3139 const char *reason;
3140 PyObject *errorHandler = NULL;
3141 PyObject *exc = NULL;
3142
Neal Norwitzd43069c2006-01-08 01:12:10 +00003143#ifdef Py_UNICODE_WIDE
3144 Py_UNICODE unimax = PyUnicode_GetMax();
3145#endif
3146
Thomas Wouters89f507f2006-12-13 04:49:30 +00003147 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003148 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3149 if (v == NULL)
3150 goto onError;
3151 if (PyUnicode_GetSize((PyObject *)v) == 0)
3152 return (PyObject *)v;
3153 p = PyUnicode_AS_UNICODE(v);
3154 end = s + size;
3155
3156 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003157 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003158 /* We have to sanity check the raw data, otherwise doom looms for
3159 some malformed UCS-4 data. */
3160 if (
3161 #ifdef Py_UNICODE_WIDE
3162 *p > unimax || *p < 0 ||
3163 #endif
3164 end-s < Py_UNICODE_SIZE
3165 )
3166 {
3167 startinpos = s - starts;
3168 if (end-s < Py_UNICODE_SIZE) {
3169 endinpos = end-starts;
3170 reason = "truncated input";
3171 }
3172 else {
3173 endinpos = s - starts + Py_UNICODE_SIZE;
3174 reason = "illegal code point (> 0x10FFFF)";
3175 }
3176 outpos = p - PyUnicode_AS_UNICODE(v);
3177 if (unicode_decode_call_errorhandler(
3178 errors, &errorHandler,
3179 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003180 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003181 (PyObject **)&v, &outpos, &p)) {
3182 goto onError;
3183 }
3184 }
3185 else {
3186 p++;
3187 s += Py_UNICODE_SIZE;
3188 }
3189 }
3190
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003191 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003192 goto onError;
3193 Py_XDECREF(errorHandler);
3194 Py_XDECREF(exc);
3195 return (PyObject *)v;
3196
3197 onError:
3198 Py_XDECREF(v);
3199 Py_XDECREF(errorHandler);
3200 Py_XDECREF(exc);
3201 return NULL;
3202}
3203
Guido van Rossumd57fd912000-03-10 22:53:23 +00003204/* --- Latin-1 Codec ------------------------------------------------------ */
3205
3206PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003207 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003208 const char *errors)
3209{
3210 PyUnicodeObject *v;
3211 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003212
Guido van Rossumd57fd912000-03-10 22:53:23 +00003213 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003214 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003215 Py_UNICODE r = *(unsigned char*)s;
3216 return PyUnicode_FromUnicode(&r, 1);
3217 }
3218
Guido van Rossumd57fd912000-03-10 22:53:23 +00003219 v = _PyUnicode_New(size);
3220 if (v == NULL)
3221 goto onError;
3222 if (size == 0)
3223 return (PyObject *)v;
3224 p = PyUnicode_AS_UNICODE(v);
3225 while (size-- > 0)
3226 *p++ = (unsigned char)*s++;
3227 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003228
Guido van Rossumd57fd912000-03-10 22:53:23 +00003229 onError:
3230 Py_XDECREF(v);
3231 return NULL;
3232}
3233
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003234/* create or adjust a UnicodeEncodeError */
3235static void make_encode_exception(PyObject **exceptionObject,
3236 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003237 const Py_UNICODE *unicode, Py_ssize_t size,
3238 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003239 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003240{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003241 if (*exceptionObject == NULL) {
3242 *exceptionObject = PyUnicodeEncodeError_Create(
3243 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003244 }
3245 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003246 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3247 goto onError;
3248 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3249 goto onError;
3250 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3251 goto onError;
3252 return;
3253 onError:
3254 Py_DECREF(*exceptionObject);
3255 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256 }
3257}
3258
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003259/* raises a UnicodeEncodeError */
3260static void raise_encode_exception(PyObject **exceptionObject,
3261 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003262 const Py_UNICODE *unicode, Py_ssize_t size,
3263 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003264 const char *reason)
3265{
3266 make_encode_exception(exceptionObject,
3267 encoding, unicode, size, startpos, endpos, reason);
3268 if (*exceptionObject != NULL)
3269 PyCodec_StrictErrors(*exceptionObject);
3270}
3271
3272/* error handling callback helper:
3273 build arguments, call the callback and check the arguments,
3274 put the result into newpos and return the replacement string, which
3275 has to be freed by the caller */
3276static PyObject *unicode_encode_call_errorhandler(const char *errors,
3277 PyObject **errorHandler,
3278 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003279 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3280 Py_ssize_t startpos, Py_ssize_t endpos,
3281 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003282{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003283 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003284
3285 PyObject *restuple;
3286 PyObject *resunicode;
3287
3288 if (*errorHandler == NULL) {
3289 *errorHandler = PyCodec_LookupError(errors);
3290 if (*errorHandler == NULL)
3291 return NULL;
3292 }
3293
3294 make_encode_exception(exceptionObject,
3295 encoding, unicode, size, startpos, endpos, reason);
3296 if (*exceptionObject == NULL)
3297 return NULL;
3298
3299 restuple = PyObject_CallFunctionObjArgs(
3300 *errorHandler, *exceptionObject, NULL);
3301 if (restuple == NULL)
3302 return NULL;
3303 if (!PyTuple_Check(restuple)) {
3304 PyErr_Format(PyExc_TypeError, &argparse[4]);
3305 Py_DECREF(restuple);
3306 return NULL;
3307 }
3308 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3309 &resunicode, newpos)) {
3310 Py_DECREF(restuple);
3311 return NULL;
3312 }
3313 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003314 *newpos = size+*newpos;
3315 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003316 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003317 Py_DECREF(restuple);
3318 return NULL;
3319 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003320 Py_INCREF(resunicode);
3321 Py_DECREF(restuple);
3322 return resunicode;
3323}
3324
3325static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003326 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003327 const char *errors,
3328 int limit)
3329{
3330 /* output object */
3331 PyObject *res;
3332 /* pointers to the beginning and end+1 of input */
3333 const Py_UNICODE *startp = p;
3334 const Py_UNICODE *endp = p + size;
3335 /* pointer to the beginning of the unencodable characters */
3336 /* const Py_UNICODE *badp = NULL; */
3337 /* pointer into the output */
3338 char *str;
3339 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003340 Py_ssize_t respos = 0;
3341 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003342 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3343 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003344 PyObject *errorHandler = NULL;
3345 PyObject *exc = NULL;
3346 /* the following variable is used for caching string comparisons
3347 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3348 int known_errorHandler = -1;
3349
3350 /* allocate enough for a simple encoding without
3351 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003352 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003353 if (res == NULL)
3354 goto onError;
3355 if (size == 0)
3356 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003357 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003358 ressize = size;
3359
3360 while (p<endp) {
3361 Py_UNICODE c = *p;
3362
3363 /* can we encode this? */
3364 if (c<limit) {
3365 /* no overflow check, because we know that the space is enough */
3366 *str++ = (char)c;
3367 ++p;
3368 }
3369 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003370 Py_ssize_t unicodepos = p-startp;
3371 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003372 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003373 Py_ssize_t repsize;
3374 Py_ssize_t newpos;
3375 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003376 Py_UNICODE *uni2;
3377 /* startpos for collecting unencodable chars */
3378 const Py_UNICODE *collstart = p;
3379 const Py_UNICODE *collend = p;
3380 /* find all unecodable characters */
3381 while ((collend < endp) && ((*collend)>=limit))
3382 ++collend;
3383 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3384 if (known_errorHandler==-1) {
3385 if ((errors==NULL) || (!strcmp(errors, "strict")))
3386 known_errorHandler = 1;
3387 else if (!strcmp(errors, "replace"))
3388 known_errorHandler = 2;
3389 else if (!strcmp(errors, "ignore"))
3390 known_errorHandler = 3;
3391 else if (!strcmp(errors, "xmlcharrefreplace"))
3392 known_errorHandler = 4;
3393 else
3394 known_errorHandler = 0;
3395 }
3396 switch (known_errorHandler) {
3397 case 1: /* strict */
3398 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3399 goto onError;
3400 case 2: /* replace */
3401 while (collstart++<collend)
3402 *str++ = '?'; /* fall through */
3403 case 3: /* ignore */
3404 p = collend;
3405 break;
3406 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003407 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003408 /* determine replacement size (temporarily (mis)uses p) */
3409 for (p = collstart, repsize = 0; p < collend; ++p) {
3410 if (*p<10)
3411 repsize += 2+1+1;
3412 else if (*p<100)
3413 repsize += 2+2+1;
3414 else if (*p<1000)
3415 repsize += 2+3+1;
3416 else if (*p<10000)
3417 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003418#ifndef Py_UNICODE_WIDE
3419 else
3420 repsize += 2+5+1;
3421#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003422 else if (*p<100000)
3423 repsize += 2+5+1;
3424 else if (*p<1000000)
3425 repsize += 2+6+1;
3426 else
3427 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003428#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003429 }
3430 requiredsize = respos+repsize+(endp-collend);
3431 if (requiredsize > ressize) {
3432 if (requiredsize<2*ressize)
3433 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003434 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003435 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003436 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003437 ressize = requiredsize;
3438 }
3439 /* generate replacement (temporarily (mis)uses p) */
3440 for (p = collstart; p < collend; ++p) {
3441 str += sprintf(str, "&#%d;", (int)*p);
3442 }
3443 p = collend;
3444 break;
3445 default:
3446 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3447 encoding, reason, startp, size, &exc,
3448 collstart-startp, collend-startp, &newpos);
3449 if (repunicode == NULL)
3450 goto onError;
3451 /* need more space? (at least enough for what we
3452 have+the replacement+the rest of the string, so
3453 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003454 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003455 repsize = PyUnicode_GET_SIZE(repunicode);
3456 requiredsize = respos+repsize+(endp-collend);
3457 if (requiredsize > ressize) {
3458 if (requiredsize<2*ressize)
3459 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003460 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003461 Py_DECREF(repunicode);
3462 goto onError;
3463 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003464 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003465 ressize = requiredsize;
3466 }
3467 /* check if there is anything unencodable in the replacement
3468 and copy it to the output */
3469 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3470 c = *uni2;
3471 if (c >= limit) {
3472 raise_encode_exception(&exc, encoding, startp, size,
3473 unicodepos, unicodepos+1, reason);
3474 Py_DECREF(repunicode);
3475 goto onError;
3476 }
3477 *str = (char)c;
3478 }
3479 p = startp + newpos;
3480 Py_DECREF(repunicode);
3481 }
3482 }
3483 }
3484 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003485 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003486 if (respos<ressize)
3487 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003488 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003489 Py_XDECREF(errorHandler);
3490 Py_XDECREF(exc);
3491 return res;
3492
3493 onError:
3494 Py_XDECREF(res);
3495 Py_XDECREF(errorHandler);
3496 Py_XDECREF(exc);
3497 return NULL;
3498}
3499
Guido van Rossumd57fd912000-03-10 22:53:23 +00003500PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003501 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003502 const char *errors)
3503{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003504 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003505}
3506
3507PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3508{
3509 if (!PyUnicode_Check(unicode)) {
3510 PyErr_BadArgument();
3511 return NULL;
3512 }
3513 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3514 PyUnicode_GET_SIZE(unicode),
3515 NULL);
3516}
3517
3518/* --- 7-bit ASCII Codec -------------------------------------------------- */
3519
Guido van Rossumd57fd912000-03-10 22:53:23 +00003520PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003521 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003522 const char *errors)
3523{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003524 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003525 PyUnicodeObject *v;
3526 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003527 Py_ssize_t startinpos;
3528 Py_ssize_t endinpos;
3529 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003530 const char *e;
3531 PyObject *errorHandler = NULL;
3532 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003533
Guido van Rossumd57fd912000-03-10 22:53:23 +00003534 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003535 if (size == 1 && *(unsigned char*)s < 128) {
3536 Py_UNICODE r = *(unsigned char*)s;
3537 return PyUnicode_FromUnicode(&r, 1);
3538 }
Tim Petersced69f82003-09-16 20:30:58 +00003539
Guido van Rossumd57fd912000-03-10 22:53:23 +00003540 v = _PyUnicode_New(size);
3541 if (v == NULL)
3542 goto onError;
3543 if (size == 0)
3544 return (PyObject *)v;
3545 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003546 e = s + size;
3547 while (s < e) {
3548 register unsigned char c = (unsigned char)*s;
3549 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003550 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003551 ++s;
3552 }
3553 else {
3554 startinpos = s-starts;
3555 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003556 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003557 if (unicode_decode_call_errorhandler(
3558 errors, &errorHandler,
3559 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003560 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003561 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003562 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003563 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003564 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003565 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003566 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003567 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003568 Py_XDECREF(errorHandler);
3569 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003570 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003571
Guido van Rossumd57fd912000-03-10 22:53:23 +00003572 onError:
3573 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003574 Py_XDECREF(errorHandler);
3575 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003576 return NULL;
3577}
3578
Guido van Rossumd57fd912000-03-10 22:53:23 +00003579PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003580 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003581 const char *errors)
3582{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003583 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003584}
3585
3586PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3587{
3588 if (!PyUnicode_Check(unicode)) {
3589 PyErr_BadArgument();
3590 return NULL;
3591 }
3592 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3593 PyUnicode_GET_SIZE(unicode),
3594 NULL);
3595}
3596
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003597#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003598
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003599/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003600
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003601#if SIZEOF_INT < SIZEOF_SSIZE_T
3602#define NEED_RETRY
3603#endif
3604
3605/* XXX This code is limited to "true" double-byte encodings, as
3606 a) it assumes an incomplete character consists of a single byte, and
3607 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3608 encodings, see IsDBCSLeadByteEx documentation. */
3609
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte, else 0.

   The byte must satisfy IsDBCSLeadByte(); it is then accepted when
   CharPrev() cannot step back (prev == curr), when the previous character
   does not start with a lead byte, or when the previous character is two
   bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3620
3621/*
3622 * Decode MBCS string into unicode object. If 'final' is set, converts
3623 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3624 */
3625static int decode_mbcs(PyUnicodeObject **v,
3626 const char *s, /* MBCS string */
3627 int size, /* sizeof MBCS string */
3628 int final)
3629{
3630 Py_UNICODE *p;
3631 Py_ssize_t n = 0;
3632 int usize = 0;
3633
3634 assert(size >= 0);
3635
3636 /* Skip trailing lead-byte unless 'final' is set */
3637 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3638 --size;
3639
3640 /* First get the size of the result */
3641 if (size > 0) {
3642 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3643 if (usize == 0) {
3644 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3645 return -1;
3646 }
3647 }
3648
3649 if (*v == NULL) {
3650 /* Create unicode object */
3651 *v = _PyUnicode_New(usize);
3652 if (*v == NULL)
3653 return -1;
3654 }
3655 else {
3656 /* Extend unicode object */
3657 n = PyUnicode_GET_SIZE(*v);
3658 if (_PyUnicode_Resize(v, n + usize) < 0)
3659 return -1;
3660 }
3661
3662 /* Do the conversion */
3663 if (size > 0) {
3664 p = PyUnicode_AS_UNICODE(*v) + n;
3665 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3666 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3667 return -1;
3668 }
3669 }
3670
3671 return size;
3672}
3673
3674PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3675 Py_ssize_t size,
3676 const char *errors,
3677 Py_ssize_t *consumed)
3678{
3679 PyUnicodeObject *v = NULL;
3680 int done;
3681
3682 if (consumed)
3683 *consumed = 0;
3684
3685#ifdef NEED_RETRY
3686 retry:
3687 if (size > INT_MAX)
3688 done = decode_mbcs(&v, s, INT_MAX, 0);
3689 else
3690#endif
3691 done = decode_mbcs(&v, s, (int)size, !consumed);
3692
3693 if (done < 0) {
3694 Py_XDECREF(v);
3695 return NULL;
3696 }
3697
3698 if (consumed)
3699 *consumed += done;
3700
3701#ifdef NEED_RETRY
3702 if (size > INT_MAX) {
3703 s += done;
3704 size -= done;
3705 goto retry;
3706 }
3707#endif
3708
3709 return (PyObject *)v;
3710}
3711
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003712PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003713 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003714 const char *errors)
3715{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003716 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3717}
3718
3719/*
3720 * Convert unicode into string object (MBCS).
3721 * Returns 0 if succeed, -1 otherwise.
3722 */
3723static int encode_mbcs(PyObject **repr,
3724 const Py_UNICODE *p, /* unicode */
3725 int size) /* size of unicode */
3726{
3727 int mbcssize = 0;
3728 Py_ssize_t n = 0;
3729
3730 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003731
3732 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003733 if (size > 0) {
3734 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3735 if (mbcssize == 0) {
3736 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3737 return -1;
3738 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003739 }
3740
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003741 if (*repr == NULL) {
3742 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003743 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003744 if (*repr == NULL)
3745 return -1;
3746 }
3747 else {
3748 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003749 n = PyBytes_Size(*repr);
3750 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003751 return -1;
3752 }
3753
3754 /* Do the conversion */
3755 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003756 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003757 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3758 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3759 return -1;
3760 }
3761 }
3762
3763 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003764}
3765
3766PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003767 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003768 const char *errors)
3769{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003770 PyObject *repr = NULL;
3771 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003772
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003773#ifdef NEED_RETRY
3774 retry:
3775 if (size > INT_MAX)
3776 ret = encode_mbcs(&repr, p, INT_MAX);
3777 else
3778#endif
3779 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003780
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003781 if (ret < 0) {
3782 Py_XDECREF(repr);
3783 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003784 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003785
3786#ifdef NEED_RETRY
3787 if (size > INT_MAX) {
3788 p += INT_MAX;
3789 size -= INT_MAX;
3790 goto retry;
3791 }
3792#endif
3793
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003794 return repr;
3795}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003796
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003797PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3798{
3799 if (!PyUnicode_Check(unicode)) {
3800 PyErr_BadArgument();
3801 return NULL;
3802 }
3803 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3804 PyUnicode_GET_SIZE(unicode),
3805 NULL);
3806}
3807
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003808#undef NEED_RETRY
3809
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003810#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003811
Guido van Rossumd57fd912000-03-10 22:53:23 +00003812/* --- Character Mapping Codec -------------------------------------------- */
3813
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003815 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816 PyObject *mapping,
3817 const char *errors)
3818{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003819 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003820 Py_ssize_t startinpos;
3821 Py_ssize_t endinpos;
3822 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003823 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003824 PyUnicodeObject *v;
3825 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003826 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003827 PyObject *errorHandler = NULL;
3828 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003829 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003830 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003831
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832 /* Default to Latin-1 */
3833 if (mapping == NULL)
3834 return PyUnicode_DecodeLatin1(s, size, errors);
3835
3836 v = _PyUnicode_New(size);
3837 if (v == NULL)
3838 goto onError;
3839 if (size == 0)
3840 return (PyObject *)v;
3841 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003842 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003843 if (PyUnicode_CheckExact(mapping)) {
3844 mapstring = PyUnicode_AS_UNICODE(mapping);
3845 maplen = PyUnicode_GET_SIZE(mapping);
3846 while (s < e) {
3847 unsigned char ch = *s;
3848 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003849
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003850 if (ch < maplen)
3851 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003852
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003853 if (x == 0xfffe) {
3854 /* undefined mapping */
3855 outpos = p-PyUnicode_AS_UNICODE(v);
3856 startinpos = s-starts;
3857 endinpos = startinpos+1;
3858 if (unicode_decode_call_errorhandler(
3859 errors, &errorHandler,
3860 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003861 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003862 (PyObject **)&v, &outpos, &p)) {
3863 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003864 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003865 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003866 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003867 *p++ = x;
3868 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003869 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003870 }
3871 else {
3872 while (s < e) {
3873 unsigned char ch = *s;
3874 PyObject *w, *x;
3875
3876 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3877 w = PyInt_FromLong((long)ch);
3878 if (w == NULL)
3879 goto onError;
3880 x = PyObject_GetItem(mapping, w);
3881 Py_DECREF(w);
3882 if (x == NULL) {
3883 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3884 /* No mapping found means: mapping is undefined. */
3885 PyErr_Clear();
3886 x = Py_None;
3887 Py_INCREF(x);
3888 } else
3889 goto onError;
3890 }
3891
3892 /* Apply mapping */
3893 if (PyInt_Check(x)) {
3894 long value = PyInt_AS_LONG(x);
3895 if (value < 0 || value > 65535) {
3896 PyErr_SetString(PyExc_TypeError,
3897 "character mapping must be in range(65536)");
3898 Py_DECREF(x);
3899 goto onError;
3900 }
3901 *p++ = (Py_UNICODE)value;
3902 }
3903 else if (x == Py_None) {
3904 /* undefined mapping */
3905 outpos = p-PyUnicode_AS_UNICODE(v);
3906 startinpos = s-starts;
3907 endinpos = startinpos+1;
3908 if (unicode_decode_call_errorhandler(
3909 errors, &errorHandler,
3910 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003911 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003912 (PyObject **)&v, &outpos, &p)) {
3913 Py_DECREF(x);
3914 goto onError;
3915 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003916 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003917 continue;
3918 }
3919 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003920 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003921
3922 if (targetsize == 1)
3923 /* 1-1 mapping */
3924 *p++ = *PyUnicode_AS_UNICODE(x);
3925
3926 else if (targetsize > 1) {
3927 /* 1-n mapping */
3928 if (targetsize > extrachars) {
3929 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003930 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3931 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003932 (targetsize << 2);
3933 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003934 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003935 if (_PyUnicode_Resize(&v,
3936 PyUnicode_GET_SIZE(v) + needed) < 0) {
3937 Py_DECREF(x);
3938 goto onError;
3939 }
3940 p = PyUnicode_AS_UNICODE(v) + oldpos;
3941 }
3942 Py_UNICODE_COPY(p,
3943 PyUnicode_AS_UNICODE(x),
3944 targetsize);
3945 p += targetsize;
3946 extrachars -= targetsize;
3947 }
3948 /* 1-0 mapping: skip the character */
3949 }
3950 else {
3951 /* wrong return value */
3952 PyErr_SetString(PyExc_TypeError,
3953 "character mapping must return integer, None or unicode");
3954 Py_DECREF(x);
3955 goto onError;
3956 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003957 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003958 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003959 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003960 }
3961 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003962 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003963 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003964 Py_XDECREF(errorHandler);
3965 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003966 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003967
Guido van Rossumd57fd912000-03-10 22:53:23 +00003968 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003969 Py_XDECREF(errorHandler);
3970 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003971 Py_XDECREF(v);
3972 return NULL;
3973}
3974
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003975/* Charmap encoding: the lookup table */
3976
3977struct encoding_map{
3978 PyObject_HEAD
3979 unsigned char level1[32];
3980 int count2, count3;
3981 unsigned char level23[1];
3982};
3983
3984static PyObject*
3985encoding_map_size(PyObject *obj, PyObject* args)
3986{
3987 struct encoding_map *map = (struct encoding_map*)obj;
3988 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3989 128*map->count3);
3990}
3991
3992static PyMethodDef encoding_map_methods[] = {
3993 {"size", encoding_map_size, METH_NOARGS,
3994 PyDoc_STR("Return the size (in bytes) of this object") },
3995 { 0 }
3996};
3997
3998static void
3999encoding_map_dealloc(PyObject* o)
4000{
4001 PyObject_FREE(o);
4002}
4003
4004static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004005 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004006 "EncodingMap", /*tp_name*/
4007 sizeof(struct encoding_map), /*tp_basicsize*/
4008 0, /*tp_itemsize*/
4009 /* methods */
4010 encoding_map_dealloc, /*tp_dealloc*/
4011 0, /*tp_print*/
4012 0, /*tp_getattr*/
4013 0, /*tp_setattr*/
4014 0, /*tp_compare*/
4015 0, /*tp_repr*/
4016 0, /*tp_as_number*/
4017 0, /*tp_as_sequence*/
4018 0, /*tp_as_mapping*/
4019 0, /*tp_hash*/
4020 0, /*tp_call*/
4021 0, /*tp_str*/
4022 0, /*tp_getattro*/
4023 0, /*tp_setattro*/
4024 0, /*tp_as_buffer*/
4025 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4026 0, /*tp_doc*/
4027 0, /*tp_traverse*/
4028 0, /*tp_clear*/
4029 0, /*tp_richcompare*/
4030 0, /*tp_weaklistoffset*/
4031 0, /*tp_iter*/
4032 0, /*tp_iternext*/
4033 encoding_map_methods, /*tp_methods*/
4034 0, /*tp_members*/
4035 0, /*tp_getset*/
4036 0, /*tp_base*/
4037 0, /*tp_dict*/
4038 0, /*tp_descr_get*/
4039 0, /*tp_descr_set*/
4040 0, /*tp_dictoffset*/
4041 0, /*tp_init*/
4042 0, /*tp_alloc*/
4043 0, /*tp_new*/
4044 0, /*tp_free*/
4045 0, /*tp_is_gc*/
4046};
4047
4048PyObject*
4049PyUnicode_BuildEncodingMap(PyObject* string)
4050{
4051 Py_UNICODE *decode;
4052 PyObject *result;
4053 struct encoding_map *mresult;
4054 int i;
4055 int need_dict = 0;
4056 unsigned char level1[32];
4057 unsigned char level2[512];
4058 unsigned char *mlevel1, *mlevel2, *mlevel3;
4059 int count2 = 0, count3 = 0;
4060
4061 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4062 PyErr_BadArgument();
4063 return NULL;
4064 }
4065 decode = PyUnicode_AS_UNICODE(string);
4066 memset(level1, 0xFF, sizeof level1);
4067 memset(level2, 0xFF, sizeof level2);
4068
4069 /* If there isn't a one-to-one mapping of NULL to \0,
4070 or if there are non-BMP characters, we need to use
4071 a mapping dictionary. */
4072 if (decode[0] != 0)
4073 need_dict = 1;
4074 for (i = 1; i < 256; i++) {
4075 int l1, l2;
4076 if (decode[i] == 0
4077 #ifdef Py_UNICODE_WIDE
4078 || decode[i] > 0xFFFF
4079 #endif
4080 ) {
4081 need_dict = 1;
4082 break;
4083 }
4084 if (decode[i] == 0xFFFE)
4085 /* unmapped character */
4086 continue;
4087 l1 = decode[i] >> 11;
4088 l2 = decode[i] >> 7;
4089 if (level1[l1] == 0xFF)
4090 level1[l1] = count2++;
4091 if (level2[l2] == 0xFF)
4092 level2[l2] = count3++;
4093 }
4094
4095 if (count2 >= 0xFF || count3 >= 0xFF)
4096 need_dict = 1;
4097
4098 if (need_dict) {
4099 PyObject *result = PyDict_New();
4100 PyObject *key, *value;
4101 if (!result)
4102 return NULL;
4103 for (i = 0; i < 256; i++) {
4104 key = value = NULL;
4105 key = PyInt_FromLong(decode[i]);
4106 value = PyInt_FromLong(i);
4107 if (!key || !value)
4108 goto failed1;
4109 if (PyDict_SetItem(result, key, value) == -1)
4110 goto failed1;
4111 Py_DECREF(key);
4112 Py_DECREF(value);
4113 }
4114 return result;
4115 failed1:
4116 Py_XDECREF(key);
4117 Py_XDECREF(value);
4118 Py_DECREF(result);
4119 return NULL;
4120 }
4121
4122 /* Create a three-level trie */
4123 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4124 16*count2 + 128*count3 - 1);
4125 if (!result)
4126 return PyErr_NoMemory();
4127 PyObject_Init(result, &EncodingMapType);
4128 mresult = (struct encoding_map*)result;
4129 mresult->count2 = count2;
4130 mresult->count3 = count3;
4131 mlevel1 = mresult->level1;
4132 mlevel2 = mresult->level23;
4133 mlevel3 = mresult->level23 + 16*count2;
4134 memcpy(mlevel1, level1, 32);
4135 memset(mlevel2, 0xFF, 16*count2);
4136 memset(mlevel3, 0, 128*count3);
4137 count3 = 0;
4138 for (i = 1; i < 256; i++) {
4139 int o1, o2, o3, i2, i3;
4140 if (decode[i] == 0xFFFE)
4141 /* unmapped character */
4142 continue;
4143 o1 = decode[i]>>11;
4144 o2 = (decode[i]>>7) & 0xF;
4145 i2 = 16*mlevel1[o1] + o2;
4146 if (mlevel2[i2] == 0xFF)
4147 mlevel2[i2] = count3++;
4148 o3 = decode[i] & 0x7F;
4149 i3 = 128*mlevel2[i2] + o3;
4150 mlevel3[i3] = i;
4151 }
4152 return result;
4153}
4154
4155static int
4156encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4157{
4158 struct encoding_map *map = (struct encoding_map*)mapping;
4159 int l1 = c>>11;
4160 int l2 = (c>>7) & 0xF;
4161 int l3 = c & 0x7F;
4162 int i;
4163
4164#ifdef Py_UNICODE_WIDE
4165 if (c > 0xFFFF) {
4166 return -1;
4167 }
4168#endif
4169 if (c == 0)
4170 return 0;
4171 /* level 1*/
4172 i = map->level1[l1];
4173 if (i == 0xFF) {
4174 return -1;
4175 }
4176 /* level 2*/
4177 i = map->level23[16*i+l2];
4178 if (i == 0xFF) {
4179 return -1;
4180 }
4181 /* level 3 */
4182 i = map->level23[16*map->count2 + 128*i + l3];
4183 if (i == 0) {
4184 return -1;
4185 }
4186 return i;
4187}
4188
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004189/* Lookup the character ch in the mapping. If the character
4190 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004191 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004192static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004193{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004194 PyObject *w = PyInt_FromLong((long)c);
4195 PyObject *x;
4196
4197 if (w == NULL)
4198 return NULL;
4199 x = PyObject_GetItem(mapping, w);
4200 Py_DECREF(w);
4201 if (x == NULL) {
4202 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4203 /* No mapping found means: mapping is undefined. */
4204 PyErr_Clear();
4205 x = Py_None;
4206 Py_INCREF(x);
4207 return x;
4208 } else
4209 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004210 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004211 else if (x == Py_None)
4212 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004213 else if (PyInt_Check(x)) {
4214 long value = PyInt_AS_LONG(x);
4215 if (value < 0 || value > 255) {
4216 PyErr_SetString(PyExc_TypeError,
4217 "character mapping must be in range(256)");
4218 Py_DECREF(x);
4219 return NULL;
4220 }
4221 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004222 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004223 else if (PyString_Check(x))
4224 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004225 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004226 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004227 PyErr_Format(PyExc_TypeError,
4228 "character mapping must return integer, None or str8, not %.400s",
4229 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004230 Py_DECREF(x);
4231 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004232 }
4233}
4234
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004235static int
Walter Dörwald827b0552007-05-12 13:23:53 +00004236charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004237{
Walter Dörwald827b0552007-05-12 13:23:53 +00004238 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004239 /* exponentially overallocate to minimize reallocations */
4240 if (requiredsize < 2*outsize)
4241 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00004242 if (PyBytes_Resize(outobj, requiredsize)) {
4243 Py_DECREF(outobj);
4244 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004245 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004246 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004247}
4248
/* Outcome of trying to encode a single character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004252/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004253 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004254 space is available. Return a new reference to the object that
4255 was put in the output buffer, or Py_None, if the mapping was undefined
4256 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004257 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004258static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004259charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00004260 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004261{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004262 PyObject *rep;
4263 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00004264 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004265
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004266 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004267 int res = encoding_map_lookup(c, mapping);
4268 Py_ssize_t requiredsize = *outpos+1;
4269 if (res == -1)
4270 return enc_FAILED;
4271 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004272 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004273 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00004274 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004275 outstart[(*outpos)++] = (char)res;
4276 return enc_SUCCESS;
4277 }
4278
4279 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004280 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004281 return enc_EXCEPTION;
4282 else if (rep==Py_None) {
4283 Py_DECREF(rep);
4284 return enc_FAILED;
4285 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004286 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004287 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004288 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004289 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004290 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004291 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004292 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004293 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004294 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4295 }
4296 else {
4297 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004298 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4299 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004300 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004301 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004302 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004303 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004304 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004305 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004306 memcpy(outstart + *outpos, repchars, repsize);
4307 *outpos += repsize;
4308 }
4309 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004310 Py_DECREF(rep);
4311 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004312}
4313
4314/* handle an error in PyUnicode_EncodeCharmap
4315 Return 0 on success, -1 on error */
4316static
4317int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004318 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004319 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004320 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004321 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004322{
4323 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004324 Py_ssize_t repsize;
4325 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004326 Py_UNICODE *uni2;
4327 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004328 Py_ssize_t collstartpos = *inpos;
4329 Py_ssize_t collendpos = *inpos+1;
4330 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004331 char *encoding = "charmap";
4332 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004333 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004334
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004335 /* find all unencodable characters */
4336 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004337 PyObject *rep;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004338 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004339 int res = encoding_map_lookup(p[collendpos], mapping);
4340 if (res != -1)
4341 break;
4342 ++collendpos;
4343 continue;
4344 }
4345
4346 rep = charmapencode_lookup(p[collendpos], mapping);
4347 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004348 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004349 else if (rep!=Py_None) {
4350 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004351 break;
4352 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004353 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004354 ++collendpos;
4355 }
4356 /* cache callback name lookup
4357 * (if not done yet, i.e. it's the first error) */
4358 if (*known_errorHandler==-1) {
4359 if ((errors==NULL) || (!strcmp(errors, "strict")))
4360 *known_errorHandler = 1;
4361 else if (!strcmp(errors, "replace"))
4362 *known_errorHandler = 2;
4363 else if (!strcmp(errors, "ignore"))
4364 *known_errorHandler = 3;
4365 else if (!strcmp(errors, "xmlcharrefreplace"))
4366 *known_errorHandler = 4;
4367 else
4368 *known_errorHandler = 0;
4369 }
4370 switch (*known_errorHandler) {
4371 case 1: /* strict */
4372 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4373 return -1;
4374 case 2: /* replace */
4375 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4376 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004377 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004378 return -1;
4379 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004380 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004381 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4382 return -1;
4383 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004384 }
4385 /* fall through */
4386 case 3: /* ignore */
4387 *inpos = collendpos;
4388 break;
4389 case 4: /* xmlcharrefreplace */
4390 /* generate replacement (temporarily (mis)uses p) */
4391 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4392 char buffer[2+29+1+1];
4393 char *cp;
4394 sprintf(buffer, "&#%d;", (int)p[collpos]);
4395 for (cp = buffer; *cp; ++cp) {
4396 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004397 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004398 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004399 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004400 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4401 return -1;
4402 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004403 }
4404 }
4405 *inpos = collendpos;
4406 break;
4407 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004408 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409 encoding, reason, p, size, exceptionObject,
4410 collstartpos, collendpos, &newpos);
4411 if (repunicode == NULL)
4412 return -1;
4413 /* generate replacement */
4414 repsize = PyUnicode_GET_SIZE(repunicode);
4415 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4416 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004417 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004418 return -1;
4419 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004420 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004421 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004422 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4423 return -1;
4424 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004425 }
4426 *inpos = newpos;
4427 Py_DECREF(repunicode);
4428 }
4429 return 0;
4430}
4431
Guido van Rossumd57fd912000-03-10 22:53:23 +00004432PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004433 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434 PyObject *mapping,
4435 const char *errors)
4436{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004437 /* output object */
4438 PyObject *res = NULL;
4439 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004440 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004441 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004442 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004443 PyObject *errorHandler = NULL;
4444 PyObject *exc = NULL;
4445 /* the following variable is used for caching string comparisons
4446 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4447 * 3=ignore, 4=xmlcharrefreplace */
4448 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004449
4450 /* Default to Latin-1 */
4451 if (mapping == NULL)
4452 return PyUnicode_EncodeLatin1(p, size, errors);
4453
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004454 /* allocate enough for a simple encoding without
4455 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004456 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004457 if (res == NULL)
4458 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004459 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004460 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004461
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004462 while (inpos<size) {
4463 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004464 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004465 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004466 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004467 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004468 if (charmap_encoding_error(p, size, &inpos, mapping,
4469 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004470 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004471 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004472 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004473 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004474 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004475 else
4476 /* done with this character => adjust input position */
4477 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004478 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004479
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004480 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004481 if (respos<PyBytes_GET_SIZE(res)) {
4482 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004483 goto onError;
4484 }
4485 Py_XDECREF(exc);
4486 Py_XDECREF(errorHandler);
4487 return res;
4488
4489 onError:
4490 Py_XDECREF(res);
4491 Py_XDECREF(exc);
4492 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004493 return NULL;
4494}
4495
4496PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4497 PyObject *mapping)
4498{
4499 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4500 PyErr_BadArgument();
4501 return NULL;
4502 }
4503 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4504 PyUnicode_GET_SIZE(unicode),
4505 mapping,
4506 NULL);
4507}
4508
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004509/* create or adjust a UnicodeTranslateError */
4510static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004511 const Py_UNICODE *unicode, Py_ssize_t size,
4512 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004513 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004514{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004515 if (*exceptionObject == NULL) {
4516 *exceptionObject = PyUnicodeTranslateError_Create(
4517 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004518 }
4519 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004520 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4521 goto onError;
4522 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4523 goto onError;
4524 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4525 goto onError;
4526 return;
4527 onError:
4528 Py_DECREF(*exceptionObject);
4529 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004530 }
4531}
4532
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004533/* raises a UnicodeTranslateError */
4534static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004535 const Py_UNICODE *unicode, Py_ssize_t size,
4536 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004537 const char *reason)
4538{
4539 make_translate_exception(exceptionObject,
4540 unicode, size, startpos, endpos, reason);
4541 if (*exceptionObject != NULL)
4542 PyCodec_StrictErrors(*exceptionObject);
4543}
4544
4545/* error handling callback helper:
4546 build arguments, call the callback and check the arguments,
4547 put the result into newpos and return the replacement string, which
4548 has to be freed by the caller */
4549static PyObject *unicode_translate_call_errorhandler(const char *errors,
4550 PyObject **errorHandler,
4551 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004552 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4553 Py_ssize_t startpos, Py_ssize_t endpos,
4554 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004555{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004556 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004557
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004558 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004559 PyObject *restuple;
4560 PyObject *resunicode;
4561
4562 if (*errorHandler == NULL) {
4563 *errorHandler = PyCodec_LookupError(errors);
4564 if (*errorHandler == NULL)
4565 return NULL;
4566 }
4567
4568 make_translate_exception(exceptionObject,
4569 unicode, size, startpos, endpos, reason);
4570 if (*exceptionObject == NULL)
4571 return NULL;
4572
4573 restuple = PyObject_CallFunctionObjArgs(
4574 *errorHandler, *exceptionObject, NULL);
4575 if (restuple == NULL)
4576 return NULL;
4577 if (!PyTuple_Check(restuple)) {
4578 PyErr_Format(PyExc_TypeError, &argparse[4]);
4579 Py_DECREF(restuple);
4580 return NULL;
4581 }
4582 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004583 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004584 Py_DECREF(restuple);
4585 return NULL;
4586 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004587 if (i_newpos<0)
4588 *newpos = size+i_newpos;
4589 else
4590 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004591 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004592 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004593 Py_DECREF(restuple);
4594 return NULL;
4595 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004596 Py_INCREF(resunicode);
4597 Py_DECREF(restuple);
4598 return resunicode;
4599}
4600
4601/* Lookup the character ch in the mapping and put the result in result,
4602 which must be decrefed by the caller.
4603 Return 0 on success, -1 on error */
4604static
4605int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4606{
4607 PyObject *w = PyInt_FromLong((long)c);
4608 PyObject *x;
4609
4610 if (w == NULL)
4611 return -1;
4612 x = PyObject_GetItem(mapping, w);
4613 Py_DECREF(w);
4614 if (x == NULL) {
4615 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4616 /* No mapping found means: use 1:1 mapping. */
4617 PyErr_Clear();
4618 *result = NULL;
4619 return 0;
4620 } else
4621 return -1;
4622 }
4623 else if (x == Py_None) {
4624 *result = x;
4625 return 0;
4626 }
4627 else if (PyInt_Check(x)) {
4628 long value = PyInt_AS_LONG(x);
4629 long max = PyUnicode_GetMax();
4630 if (value < 0 || value > max) {
4631 PyErr_Format(PyExc_TypeError,
4632 "character mapping must be in range(0x%lx)", max+1);
4633 Py_DECREF(x);
4634 return -1;
4635 }
4636 *result = x;
4637 return 0;
4638 }
4639 else if (PyUnicode_Check(x)) {
4640 *result = x;
4641 return 0;
4642 }
4643 else {
4644 /* wrong return value */
4645 PyErr_SetString(PyExc_TypeError,
4646 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004647 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004648 return -1;
4649 }
4650}
4651/* ensure that *outobj is at least requiredsize characters long,
4652if not reallocate and adjust various state variables.
4653Return 0 on success, -1 on error */
4654static
Walter Dörwald4894c302003-10-24 14:25:28 +00004655int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004656 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004657{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004658 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004659 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004660 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004661 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004662 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004663 if (requiredsize < 2 * oldsize)
4664 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004665 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004666 return -1;
4667 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004668 }
4669 return 0;
4670}
4671/* lookup the character, put the result in the output string and adjust
4672 various state variables. Return a new reference to the object that
4673 was put in the output buffer in *result, or Py_None, if the mapping was
4674 undefined (in which case no character was written).
4675 The called must decref result.
4676 Return 0 on success, -1 on error. */
4677static
Walter Dörwald4894c302003-10-24 14:25:28 +00004678int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004679 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004680 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004681{
Walter Dörwald4894c302003-10-24 14:25:28 +00004682 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004683 return -1;
4684 if (*res==NULL) {
4685 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004686 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004687 }
4688 else if (*res==Py_None)
4689 ;
4690 else if (PyInt_Check(*res)) {
4691 /* no overflow check, because we know that the space is enough */
4692 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4693 }
4694 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004695 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004696 if (repsize==1) {
4697 /* no overflow check, because we know that the space is enough */
4698 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4699 }
4700 else if (repsize!=0) {
4701 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004702 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004703 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004704 repsize - 1;
4705 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004706 return -1;
4707 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4708 *outp += repsize;
4709 }
4710 }
4711 else
4712 return -1;
4713 return 0;
4714}
4715
4716PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004717 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718 PyObject *mapping,
4719 const char *errors)
4720{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004721 /* output object */
4722 PyObject *res = NULL;
4723 /* pointers to the beginning and end+1 of input */
4724 const Py_UNICODE *startp = p;
4725 const Py_UNICODE *endp = p + size;
4726 /* pointer into the output */
4727 Py_UNICODE *str;
4728 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004729 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004730 char *reason = "character maps to <undefined>";
4731 PyObject *errorHandler = NULL;
4732 PyObject *exc = NULL;
4733 /* the following variable is used for caching string comparisons
4734 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4735 * 3=ignore, 4=xmlcharrefreplace */
4736 int known_errorHandler = -1;
4737
Guido van Rossumd57fd912000-03-10 22:53:23 +00004738 if (mapping == NULL) {
4739 PyErr_BadArgument();
4740 return NULL;
4741 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004742
4743 /* allocate enough for a simple 1:1 translation without
4744 replacements, if we need more, we'll resize */
4745 res = PyUnicode_FromUnicode(NULL, size);
4746 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004747 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004749 return res;
4750 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004751
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004752 while (p<endp) {
4753 /* try to encode it */
4754 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004755 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004756 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004757 goto onError;
4758 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004759 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004760 if (x!=Py_None) /* it worked => adjust input pointer */
4761 ++p;
4762 else { /* untranslatable character */
4763 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004764 Py_ssize_t repsize;
4765 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004766 Py_UNICODE *uni2;
4767 /* startpos for collecting untranslatable chars */
4768 const Py_UNICODE *collstart = p;
4769 const Py_UNICODE *collend = p+1;
4770 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004771
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004772 /* find all untranslatable characters */
4773 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004774 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004775 goto onError;
4776 Py_XDECREF(x);
4777 if (x!=Py_None)
4778 break;
4779 ++collend;
4780 }
4781 /* cache callback name lookup
4782 * (if not done yet, i.e. it's the first error) */
4783 if (known_errorHandler==-1) {
4784 if ((errors==NULL) || (!strcmp(errors, "strict")))
4785 known_errorHandler = 1;
4786 else if (!strcmp(errors, "replace"))
4787 known_errorHandler = 2;
4788 else if (!strcmp(errors, "ignore"))
4789 known_errorHandler = 3;
4790 else if (!strcmp(errors, "xmlcharrefreplace"))
4791 known_errorHandler = 4;
4792 else
4793 known_errorHandler = 0;
4794 }
4795 switch (known_errorHandler) {
4796 case 1: /* strict */
4797 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4798 goto onError;
4799 case 2: /* replace */
4800 /* No need to check for space, this is a 1:1 replacement */
4801 for (coll = collstart; coll<collend; ++coll)
4802 *str++ = '?';
4803 /* fall through */
4804 case 3: /* ignore */
4805 p = collend;
4806 break;
4807 case 4: /* xmlcharrefreplace */
4808 /* generate replacement (temporarily (mis)uses p) */
4809 for (p = collstart; p < collend; ++p) {
4810 char buffer[2+29+1+1];
4811 char *cp;
4812 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004813 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004814 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4815 goto onError;
4816 for (cp = buffer; *cp; ++cp)
4817 *str++ = *cp;
4818 }
4819 p = collend;
4820 break;
4821 default:
4822 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4823 reason, startp, size, &exc,
4824 collstart-startp, collend-startp, &newpos);
4825 if (repunicode == NULL)
4826 goto onError;
4827 /* generate replacement */
4828 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004829 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004830 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4831 Py_DECREF(repunicode);
4832 goto onError;
4833 }
4834 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4835 *str++ = *uni2;
4836 p = startp + newpos;
4837 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004838 }
4839 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004841 /* Resize if we allocated to much */
4842 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004843 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004844 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004845 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004846 }
4847 Py_XDECREF(exc);
4848 Py_XDECREF(errorHandler);
4849 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004850
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004851 onError:
4852 Py_XDECREF(res);
4853 Py_XDECREF(exc);
4854 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855 return NULL;
4856}
4857
4858PyObject *PyUnicode_Translate(PyObject *str,
4859 PyObject *mapping,
4860 const char *errors)
4861{
4862 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004863
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864 str = PyUnicode_FromObject(str);
4865 if (str == NULL)
4866 goto onError;
4867 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4868 PyUnicode_GET_SIZE(str),
4869 mapping,
4870 errors);
4871 Py_DECREF(str);
4872 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004873
Guido van Rossumd57fd912000-03-10 22:53:23 +00004874 onError:
4875 Py_XDECREF(str);
4876 return NULL;
4877}
Tim Petersced69f82003-09-16 20:30:58 +00004878
Guido van Rossum9e896b32000-04-05 20:11:21 +00004879/* --- Decimal Encoder ---------------------------------------------------- */
4880
4881int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004882 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004883 char *output,
4884 const char *errors)
4885{
4886 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004887 PyObject *errorHandler = NULL;
4888 PyObject *exc = NULL;
4889 const char *encoding = "decimal";
4890 const char *reason = "invalid decimal Unicode string";
4891 /* the following variable is used for caching string comparisons
4892 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4893 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004894
4895 if (output == NULL) {
4896 PyErr_BadArgument();
4897 return -1;
4898 }
4899
4900 p = s;
4901 end = s + length;
4902 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004903 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004904 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004905 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004906 Py_ssize_t repsize;
4907 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004908 Py_UNICODE *uni2;
4909 Py_UNICODE *collstart;
4910 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004911
Guido van Rossum9e896b32000-04-05 20:11:21 +00004912 if (Py_UNICODE_ISSPACE(ch)) {
4913 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004914 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004915 continue;
4916 }
4917 decimal = Py_UNICODE_TODECIMAL(ch);
4918 if (decimal >= 0) {
4919 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004920 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004921 continue;
4922 }
Guido van Rossumba477042000-04-06 18:18:10 +00004923 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004924 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004925 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004926 continue;
4927 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004928 /* All other characters are considered unencodable */
4929 collstart = p;
4930 collend = p+1;
4931 while (collend < end) {
4932 if ((0 < *collend && *collend < 256) ||
4933 !Py_UNICODE_ISSPACE(*collend) ||
4934 Py_UNICODE_TODECIMAL(*collend))
4935 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004936 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004937 /* cache callback name lookup
4938 * (if not done yet, i.e. it's the first error) */
4939 if (known_errorHandler==-1) {
4940 if ((errors==NULL) || (!strcmp(errors, "strict")))
4941 known_errorHandler = 1;
4942 else if (!strcmp(errors, "replace"))
4943 known_errorHandler = 2;
4944 else if (!strcmp(errors, "ignore"))
4945 known_errorHandler = 3;
4946 else if (!strcmp(errors, "xmlcharrefreplace"))
4947 known_errorHandler = 4;
4948 else
4949 known_errorHandler = 0;
4950 }
4951 switch (known_errorHandler) {
4952 case 1: /* strict */
4953 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4954 goto onError;
4955 case 2: /* replace */
4956 for (p = collstart; p < collend; ++p)
4957 *output++ = '?';
4958 /* fall through */
4959 case 3: /* ignore */
4960 p = collend;
4961 break;
4962 case 4: /* xmlcharrefreplace */
4963 /* generate replacement (temporarily (mis)uses p) */
4964 for (p = collstart; p < collend; ++p)
4965 output += sprintf(output, "&#%d;", (int)*p);
4966 p = collend;
4967 break;
4968 default:
4969 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4970 encoding, reason, s, length, &exc,
4971 collstart-s, collend-s, &newpos);
4972 if (repunicode == NULL)
4973 goto onError;
4974 /* generate replacement */
4975 repsize = PyUnicode_GET_SIZE(repunicode);
4976 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4977 Py_UNICODE ch = *uni2;
4978 if (Py_UNICODE_ISSPACE(ch))
4979 *output++ = ' ';
4980 else {
4981 decimal = Py_UNICODE_TODECIMAL(ch);
4982 if (decimal >= 0)
4983 *output++ = '0' + decimal;
4984 else if (0 < ch && ch < 256)
4985 *output++ = (char)ch;
4986 else {
4987 Py_DECREF(repunicode);
4988 raise_encode_exception(&exc, encoding,
4989 s, length, collstart-s, collend-s, reason);
4990 goto onError;
4991 }
4992 }
4993 }
4994 p = s + newpos;
4995 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004996 }
4997 }
4998 /* 0-terminate the output string */
4999 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005000 Py_XDECREF(exc);
5001 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005002 return 0;
5003
5004 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005005 Py_XDECREF(exc);
5006 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005007 return -1;
5008}
5009
Guido van Rossumd57fd912000-03-10 22:53:23 +00005010/* --- Helpers ------------------------------------------------------------ */
5011
Thomas Wouters477c8d52006-05-27 19:21:47 +00005012#define STRINGLIB_CHAR Py_UNICODE
5013
5014#define STRINGLIB_LEN PyUnicode_GET_SIZE
5015#define STRINGLIB_NEW PyUnicode_FromUnicode
5016#define STRINGLIB_STR PyUnicode_AS_UNICODE
5017
5018Py_LOCAL_INLINE(int)
5019STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005020{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005021 if (str[0] != other[0])
5022 return 1;
5023 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005024}
5025
Thomas Wouters477c8d52006-05-27 19:21:47 +00005026#define STRINGLIB_EMPTY unicode_empty
5027
5028#include "stringlib/fastsearch.h"
5029
5030#include "stringlib/count.h"
5031#include "stringlib/find.h"
5032#include "stringlib/partition.h"
5033
/* Helper macro to fix up start/end slice values.
   Operates on variables named `start` and `end` that must be in scope at
   the point of use; (obj)->length supplies the string length.
   Negative values are taken relative to the end of the string and then
   clamped to 0; `end` is clamped to the length.  `start` is NOT clamped
   to the length, so callers must cope with start > end.
   Wrapped in do { } while (0) so the macro expands to exactly one
   statement and is safe inside unbraced if/else; use it with a
   trailing semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5046
Martin v. Löwis18e16552006-02-15 17:27:45 +00005047Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005048 PyObject *substr,
5049 Py_ssize_t start,
5050 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005051{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005052 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005053 PyUnicodeObject* str_obj;
5054 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005055
Thomas Wouters477c8d52006-05-27 19:21:47 +00005056 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5057 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005058 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005059 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5060 if (!sub_obj) {
5061 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005062 return -1;
5063 }
Tim Petersced69f82003-09-16 20:30:58 +00005064
Thomas Wouters477c8d52006-05-27 19:21:47 +00005065 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005066
Thomas Wouters477c8d52006-05-27 19:21:47 +00005067 result = stringlib_count(
5068 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5069 );
5070
5071 Py_DECREF(sub_obj);
5072 Py_DECREF(str_obj);
5073
Guido van Rossumd57fd912000-03-10 22:53:23 +00005074 return result;
5075}
5076
Martin v. Löwis18e16552006-02-15 17:27:45 +00005077Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005078 PyObject *sub,
5079 Py_ssize_t start,
5080 Py_ssize_t end,
5081 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005082{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005083 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005084
Guido van Rossumd57fd912000-03-10 22:53:23 +00005085 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005086 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005087 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005088 sub = PyUnicode_FromObject(sub);
5089 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005090 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005091 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005092 }
Tim Petersced69f82003-09-16 20:30:58 +00005093
Thomas Wouters477c8d52006-05-27 19:21:47 +00005094 if (direction > 0)
5095 result = stringlib_find_slice(
5096 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5097 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5098 start, end
5099 );
5100 else
5101 result = stringlib_rfind_slice(
5102 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5103 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5104 start, end
5105 );
5106
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005108 Py_DECREF(sub);
5109
Guido van Rossumd57fd912000-03-10 22:53:23 +00005110 return result;
5111}
5112
Tim Petersced69f82003-09-16 20:30:58 +00005113static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005114int tailmatch(PyUnicodeObject *self,
5115 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005116 Py_ssize_t start,
5117 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118 int direction)
5119{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005120 if (substring->length == 0)
5121 return 1;
5122
Thomas Wouters477c8d52006-05-27 19:21:47 +00005123 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005124
5125 end -= substring->length;
5126 if (end < start)
5127 return 0;
5128
5129 if (direction > 0) {
5130 if (Py_UNICODE_MATCH(self, end, substring))
5131 return 1;
5132 } else {
5133 if (Py_UNICODE_MATCH(self, start, substring))
5134 return 1;
5135 }
5136
5137 return 0;
5138}
5139
Martin v. Löwis18e16552006-02-15 17:27:45 +00005140Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005141 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005142 Py_ssize_t start,
5143 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005144 int direction)
5145{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005146 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005147
Guido van Rossumd57fd912000-03-10 22:53:23 +00005148 str = PyUnicode_FromObject(str);
5149 if (str == NULL)
5150 return -1;
5151 substr = PyUnicode_FromObject(substr);
5152 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005153 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005154 return -1;
5155 }
Tim Petersced69f82003-09-16 20:30:58 +00005156
Guido van Rossumd57fd912000-03-10 22:53:23 +00005157 result = tailmatch((PyUnicodeObject *)str,
5158 (PyUnicodeObject *)substr,
5159 start, end, direction);
5160 Py_DECREF(str);
5161 Py_DECREF(substr);
5162 return result;
5163}
5164
Guido van Rossumd57fd912000-03-10 22:53:23 +00005165/* Apply fixfct filter to the Unicode object self and return a
5166 reference to the modified object */
5167
Tim Petersced69f82003-09-16 20:30:58 +00005168static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005169PyObject *fixup(PyUnicodeObject *self,
5170 int (*fixfct)(PyUnicodeObject *s))
5171{
5172
5173 PyUnicodeObject *u;
5174
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005175 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176 if (u == NULL)
5177 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005178
5179 Py_UNICODE_COPY(u->str, self->str, self->length);
5180
Tim Peters7a29bd52001-09-12 03:03:31 +00005181 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182 /* fixfct should return TRUE if it modified the buffer. If
5183 FALSE, return a reference to the original buffer instead
5184 (to save space, not time) */
5185 Py_INCREF(self);
5186 Py_DECREF(u);
5187 return (PyObject*) self;
5188 }
5189 return (PyObject*) u;
5190}
5191
Tim Petersced69f82003-09-16 20:30:58 +00005192static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193int fixupper(PyUnicodeObject *self)
5194{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005195 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196 Py_UNICODE *s = self->str;
5197 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005198
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199 while (len-- > 0) {
5200 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005201
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202 ch = Py_UNICODE_TOUPPER(*s);
5203 if (ch != *s) {
5204 status = 1;
5205 *s = ch;
5206 }
5207 s++;
5208 }
5209
5210 return status;
5211}
5212
Tim Petersced69f82003-09-16 20:30:58 +00005213static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214int fixlower(PyUnicodeObject *self)
5215{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005216 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217 Py_UNICODE *s = self->str;
5218 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005219
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220 while (len-- > 0) {
5221 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005222
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223 ch = Py_UNICODE_TOLOWER(*s);
5224 if (ch != *s) {
5225 status = 1;
5226 *s = ch;
5227 }
5228 s++;
5229 }
5230
5231 return status;
5232}
5233
Tim Petersced69f82003-09-16 20:30:58 +00005234static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005235int fixswapcase(PyUnicodeObject *self)
5236{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005237 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005238 Py_UNICODE *s = self->str;
5239 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005240
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241 while (len-- > 0) {
5242 if (Py_UNICODE_ISUPPER(*s)) {
5243 *s = Py_UNICODE_TOLOWER(*s);
5244 status = 1;
5245 } else if (Py_UNICODE_ISLOWER(*s)) {
5246 *s = Py_UNICODE_TOUPPER(*s);
5247 status = 1;
5248 }
5249 s++;
5250 }
5251
5252 return status;
5253}
5254
Tim Petersced69f82003-09-16 20:30:58 +00005255static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256int fixcapitalize(PyUnicodeObject *self)
5257{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005258 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005259 Py_UNICODE *s = self->str;
5260 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005261
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005262 if (len == 0)
5263 return 0;
5264 if (Py_UNICODE_ISLOWER(*s)) {
5265 *s = Py_UNICODE_TOUPPER(*s);
5266 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005268 s++;
5269 while (--len > 0) {
5270 if (Py_UNICODE_ISUPPER(*s)) {
5271 *s = Py_UNICODE_TOLOWER(*s);
5272 status = 1;
5273 }
5274 s++;
5275 }
5276 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277}
5278
5279static
5280int fixtitle(PyUnicodeObject *self)
5281{
5282 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5283 register Py_UNICODE *e;
5284 int previous_is_cased;
5285
5286 /* Shortcut for single character strings */
5287 if (PyUnicode_GET_SIZE(self) == 1) {
5288 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5289 if (*p != ch) {
5290 *p = ch;
5291 return 1;
5292 }
5293 else
5294 return 0;
5295 }
Tim Petersced69f82003-09-16 20:30:58 +00005296
Guido van Rossumd57fd912000-03-10 22:53:23 +00005297 e = p + PyUnicode_GET_SIZE(self);
5298 previous_is_cased = 0;
5299 for (; p < e; p++) {
5300 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005301
Guido van Rossumd57fd912000-03-10 22:53:23 +00005302 if (previous_is_cased)
5303 *p = Py_UNICODE_TOLOWER(ch);
5304 else
5305 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005306
5307 if (Py_UNICODE_ISLOWER(ch) ||
5308 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005309 Py_UNICODE_ISTITLE(ch))
5310 previous_is_cased = 1;
5311 else
5312 previous_is_cased = 0;
5313 }
5314 return 1;
5315}
5316
Tim Peters8ce9f162004-08-27 01:49:32 +00005317PyObject *
5318PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005319{
Tim Peters8ce9f162004-08-27 01:49:32 +00005320 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005321 const Py_UNICODE blank = ' ';
5322 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005323 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005324 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005325 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5326 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005327 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5328 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005329 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005330 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005331 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332
Tim Peters05eba1f2004-08-27 21:32:02 +00005333 fseq = PySequence_Fast(seq, "");
5334 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005335 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005336 }
5337
Tim Peters91879ab2004-08-27 22:35:44 +00005338 /* Grrrr. A codec may be invoked to convert str objects to
5339 * Unicode, and so it's possible to call back into Python code
5340 * during PyUnicode_FromObject(), and so it's possible for a sick
5341 * codec to change the size of fseq (if seq is a list). Therefore
5342 * we have to keep refetching the size -- can't assume seqlen
5343 * is invariant.
5344 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005345 seqlen = PySequence_Fast_GET_SIZE(fseq);
5346 /* If empty sequence, return u"". */
5347 if (seqlen == 0) {
5348 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5349 goto Done;
5350 }
5351 /* If singleton sequence with an exact Unicode, return that. */
5352 if (seqlen == 1) {
5353 item = PySequence_Fast_GET_ITEM(fseq, 0);
5354 if (PyUnicode_CheckExact(item)) {
5355 Py_INCREF(item);
5356 res = (PyUnicodeObject *)item;
5357 goto Done;
5358 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005359 }
5360
Tim Peters05eba1f2004-08-27 21:32:02 +00005361 /* At least two items to join, or one that isn't exact Unicode. */
5362 if (seqlen > 1) {
5363 /* Set up sep and seplen -- they're needed. */
5364 if (separator == NULL) {
5365 sep = &blank;
5366 seplen = 1;
5367 }
5368 else {
5369 internal_separator = PyUnicode_FromObject(separator);
5370 if (internal_separator == NULL)
5371 goto onError;
5372 sep = PyUnicode_AS_UNICODE(internal_separator);
5373 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005374 /* In case PyUnicode_FromObject() mutated seq. */
5375 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005376 }
5377 }
5378
5379 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005380 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005381 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005382 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005383 res_p = PyUnicode_AS_UNICODE(res);
5384 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005385
Tim Peters05eba1f2004-08-27 21:32:02 +00005386 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005387 Py_ssize_t itemlen;
5388 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005389
5390 item = PySequence_Fast_GET_ITEM(fseq, i);
5391 /* Convert item to Unicode. */
5392 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5393 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005394 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005395 " %.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00005396 i, Py_Type(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005397 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005398 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005399 item = PyUnicode_FromObject(item);
5400 if (item == NULL)
5401 goto onError;
5402 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005403
Tim Peters91879ab2004-08-27 22:35:44 +00005404 /* In case PyUnicode_FromObject() mutated seq. */
5405 seqlen = PySequence_Fast_GET_SIZE(fseq);
5406
Tim Peters8ce9f162004-08-27 01:49:32 +00005407 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005409 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005410 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005411 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005412 if (i < seqlen - 1) {
5413 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005414 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005415 goto Overflow;
5416 }
5417 if (new_res_used > res_alloc) {
5418 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005419 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005420 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005421 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005422 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005423 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005424 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005425 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005427 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005428 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005430
5431 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005432 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005433 res_p += itemlen;
5434 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005435 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005436 res_p += seplen;
5437 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005439 res_used = new_res_used;
5440 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005441
Tim Peters05eba1f2004-08-27 21:32:02 +00005442 /* Shrink res to match the used area; this probably can't fail,
5443 * but it's cheap to check.
5444 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005445 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005446 goto onError;
5447
5448 Done:
5449 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005450 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451 return (PyObject *)res;
5452
Tim Peters8ce9f162004-08-27 01:49:32 +00005453 Overflow:
5454 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005455 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005456 Py_DECREF(item);
5457 /* fall through */
5458
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005460 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005461 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005462 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005463 return NULL;
5464}
5465
Tim Petersced69f82003-09-16 20:30:58 +00005466static
5467PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005468 Py_ssize_t left,
5469 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005470 Py_UNICODE fill)
5471{
5472 PyUnicodeObject *u;
5473
5474 if (left < 0)
5475 left = 0;
5476 if (right < 0)
5477 right = 0;
5478
Tim Peters7a29bd52001-09-12 03:03:31 +00005479 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480 Py_INCREF(self);
5481 return self;
5482 }
5483
5484 u = _PyUnicode_New(left + self->length + right);
5485 if (u) {
5486 if (left)
5487 Py_UNICODE_FILL(u->str, fill, left);
5488 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5489 if (right)
5490 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5491 }
5492
5493 return u;
5494}
5495
/* Append the slice data[left:right] to `list` as a new unicode object.

   Relies on names from the expansion site: a PyObject *str scratch
   variable, a PyObject *list, and an onError label that is jumped to when
   either the slice cannot be created or the append fails.  The temporary
   reference held in str is always released.

   Wrapped in do { } while (0) so the macro behaves as a single statement
   (safe under an unbraced if/else); callers terminate it with ';'. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5506
5507static
5508PyObject *split_whitespace(PyUnicodeObject *self,
5509 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005510 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005511{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005512 register Py_ssize_t i;
5513 register Py_ssize_t j;
5514 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005515 PyObject *str;
5516
5517 for (i = j = 0; i < len; ) {
5518 /* find a token */
5519 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5520 i++;
5521 j = i;
5522 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5523 i++;
5524 if (j < i) {
5525 if (maxcount-- <= 0)
5526 break;
5527 SPLIT_APPEND(self->str, j, i);
5528 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5529 i++;
5530 j = i;
5531 }
5532 }
5533 if (j < len) {
5534 SPLIT_APPEND(self->str, j, len);
5535 }
5536 return list;
5537
5538 onError:
5539 Py_DECREF(list);
5540 return NULL;
5541}
5542
5543PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005544 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005546 register Py_ssize_t i;
5547 register Py_ssize_t j;
5548 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549 PyObject *list;
5550 PyObject *str;
5551 Py_UNICODE *data;
5552
5553 string = PyUnicode_FromObject(string);
5554 if (string == NULL)
5555 return NULL;
5556 data = PyUnicode_AS_UNICODE(string);
5557 len = PyUnicode_GET_SIZE(string);
5558
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559 list = PyList_New(0);
5560 if (!list)
5561 goto onError;
5562
5563 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005564 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005565
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005567 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569
5570 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005571 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572 if (i < len) {
5573 if (data[i] == '\r' && i + 1 < len &&
5574 data[i+1] == '\n')
5575 i += 2;
5576 else
5577 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005578 if (keepends)
5579 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580 }
Guido van Rossum86662912000-04-11 15:38:46 +00005581 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582 j = i;
5583 }
5584 if (j < len) {
5585 SPLIT_APPEND(data, j, len);
5586 }
5587
5588 Py_DECREF(string);
5589 return list;
5590
5591 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005592 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593 Py_DECREF(string);
5594 return NULL;
5595}
5596
Tim Petersced69f82003-09-16 20:30:58 +00005597static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005598PyObject *split_char(PyUnicodeObject *self,
5599 PyObject *list,
5600 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005601 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005602{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005603 register Py_ssize_t i;
5604 register Py_ssize_t j;
5605 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606 PyObject *str;
5607
5608 for (i = j = 0; i < len; ) {
5609 if (self->str[i] == ch) {
5610 if (maxcount-- <= 0)
5611 break;
5612 SPLIT_APPEND(self->str, j, i);
5613 i = j = i + 1;
5614 } else
5615 i++;
5616 }
5617 if (j <= len) {
5618 SPLIT_APPEND(self->str, j, len);
5619 }
5620 return list;
5621
5622 onError:
5623 Py_DECREF(list);
5624 return NULL;
5625}
5626
Tim Petersced69f82003-09-16 20:30:58 +00005627static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628PyObject *split_substring(PyUnicodeObject *self,
5629 PyObject *list,
5630 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005631 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005633 register Py_ssize_t i;
5634 register Py_ssize_t j;
5635 Py_ssize_t len = self->length;
5636 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005637 PyObject *str;
5638
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005639 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640 if (Py_UNICODE_MATCH(self, i, substring)) {
5641 if (maxcount-- <= 0)
5642 break;
5643 SPLIT_APPEND(self->str, j, i);
5644 i = j = i + sublen;
5645 } else
5646 i++;
5647 }
5648 if (j <= len) {
5649 SPLIT_APPEND(self->str, j, len);
5650 }
5651 return list;
5652
5653 onError:
5654 Py_DECREF(list);
5655 return NULL;
5656}
5657
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005658static
5659PyObject *rsplit_whitespace(PyUnicodeObject *self,
5660 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005661 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005662{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005663 register Py_ssize_t i;
5664 register Py_ssize_t j;
5665 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005666 PyObject *str;
5667
5668 for (i = j = len - 1; i >= 0; ) {
5669 /* find a token */
5670 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5671 i--;
5672 j = i;
5673 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5674 i--;
5675 if (j > i) {
5676 if (maxcount-- <= 0)
5677 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005678 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005679 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5680 i--;
5681 j = i;
5682 }
5683 }
5684 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005685 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005686 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005687 if (PyList_Reverse(list) < 0)
5688 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005689 return list;
5690
5691 onError:
5692 Py_DECREF(list);
5693 return NULL;
5694}
5695
5696static
5697PyObject *rsplit_char(PyUnicodeObject *self,
5698 PyObject *list,
5699 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005700 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005701{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005702 register Py_ssize_t i;
5703 register Py_ssize_t j;
5704 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005705 PyObject *str;
5706
5707 for (i = j = len - 1; i >= 0; ) {
5708 if (self->str[i] == ch) {
5709 if (maxcount-- <= 0)
5710 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005711 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005712 j = i = i - 1;
5713 } else
5714 i--;
5715 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005716 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005717 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005718 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005719 if (PyList_Reverse(list) < 0)
5720 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005721 return list;
5722
5723 onError:
5724 Py_DECREF(list);
5725 return NULL;
5726}
5727
5728static
5729PyObject *rsplit_substring(PyUnicodeObject *self,
5730 PyObject *list,
5731 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005732 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005733{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005734 register Py_ssize_t i;
5735 register Py_ssize_t j;
5736 Py_ssize_t len = self->length;
5737 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005738 PyObject *str;
5739
5740 for (i = len - sublen, j = len; i >= 0; ) {
5741 if (Py_UNICODE_MATCH(self, i, substring)) {
5742 if (maxcount-- <= 0)
5743 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005744 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005745 j = i;
5746 i -= sublen;
5747 } else
5748 i--;
5749 }
5750 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005751 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005752 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005753 if (PyList_Reverse(list) < 0)
5754 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005755 return list;
5756
5757 onError:
5758 Py_DECREF(list);
5759 return NULL;
5760}
5761
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762#undef SPLIT_APPEND
5763
5764static
5765PyObject *split(PyUnicodeObject *self,
5766 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005767 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768{
5769 PyObject *list;
5770
5771 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005772 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773
5774 list = PyList_New(0);
5775 if (!list)
5776 return NULL;
5777
5778 if (substring == NULL)
5779 return split_whitespace(self,list,maxcount);
5780
5781 else if (substring->length == 1)
5782 return split_char(self,list,substring->str[0],maxcount);
5783
5784 else if (substring->length == 0) {
5785 Py_DECREF(list);
5786 PyErr_SetString(PyExc_ValueError, "empty separator");
5787 return NULL;
5788 }
5789 else
5790 return split_substring(self,list,substring,maxcount);
5791}
5792
Tim Petersced69f82003-09-16 20:30:58 +00005793static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005794PyObject *rsplit(PyUnicodeObject *self,
5795 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005796 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005797{
5798 PyObject *list;
5799
5800 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005801 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005802
5803 list = PyList_New(0);
5804 if (!list)
5805 return NULL;
5806
5807 if (substring == NULL)
5808 return rsplit_whitespace(self,list,maxcount);
5809
5810 else if (substring->length == 1)
5811 return rsplit_char(self,list,substring->str[0],maxcount);
5812
5813 else if (substring->length == 0) {
5814 Py_DECREF(list);
5815 PyErr_SetString(PyExc_ValueError, "empty separator");
5816 return NULL;
5817 }
5818 else
5819 return rsplit_substring(self,list,substring,maxcount);
5820}
5821
5822static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823PyObject *replace(PyUnicodeObject *self,
5824 PyUnicodeObject *str1,
5825 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005826 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827{
5828 PyUnicodeObject *u;
5829
5830 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005831 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832
Thomas Wouters477c8d52006-05-27 19:21:47 +00005833 if (str1->length == str2->length) {
5834 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005835 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005836 if (str1->length == 1) {
5837 /* replace characters */
5838 Py_UNICODE u1, u2;
5839 if (!findchar(self->str, self->length, str1->str[0]))
5840 goto nothing;
5841 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5842 if (!u)
5843 return NULL;
5844 Py_UNICODE_COPY(u->str, self->str, self->length);
5845 u1 = str1->str[0];
5846 u2 = str2->str[0];
5847 for (i = 0; i < u->length; i++)
5848 if (u->str[i] == u1) {
5849 if (--maxcount < 0)
5850 break;
5851 u->str[i] = u2;
5852 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005854 i = fastsearch(
5855 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005857 if (i < 0)
5858 goto nothing;
5859 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5860 if (!u)
5861 return NULL;
5862 Py_UNICODE_COPY(u->str, self->str, self->length);
5863 while (i <= self->length - str1->length)
5864 if (Py_UNICODE_MATCH(self, i, str1)) {
5865 if (--maxcount < 0)
5866 break;
5867 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5868 i += str1->length;
5869 } else
5870 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005873
5874 Py_ssize_t n, i, j, e;
5875 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876 Py_UNICODE *p;
5877
5878 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005879 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880 if (n > maxcount)
5881 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005882 if (n == 0)
5883 goto nothing;
5884 /* new_size = self->length + n * (str2->length - str1->length)); */
5885 delta = (str2->length - str1->length);
5886 if (delta == 0) {
5887 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005889 product = n * (str2->length - str1->length);
5890 if ((product / (str2->length - str1->length)) != n) {
5891 PyErr_SetString(PyExc_OverflowError,
5892 "replace string is too long");
5893 return NULL;
5894 }
5895 new_size = self->length + product;
5896 if (new_size < 0) {
5897 PyErr_SetString(PyExc_OverflowError,
5898 "replace string is too long");
5899 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900 }
5901 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005902 u = _PyUnicode_New(new_size);
5903 if (!u)
5904 return NULL;
5905 i = 0;
5906 p = u->str;
5907 e = self->length - str1->length;
5908 if (str1->length > 0) {
5909 while (n-- > 0) {
5910 /* look for next match */
5911 j = i;
5912 while (j <= e) {
5913 if (Py_UNICODE_MATCH(self, j, str1))
5914 break;
5915 j++;
5916 }
5917 if (j > i) {
5918 if (j > e)
5919 break;
5920 /* copy unchanged part [i:j] */
5921 Py_UNICODE_COPY(p, self->str+i, j-i);
5922 p += j - i;
5923 }
5924 /* copy substitution string */
5925 if (str2->length > 0) {
5926 Py_UNICODE_COPY(p, str2->str, str2->length);
5927 p += str2->length;
5928 }
5929 i = j + str1->length;
5930 }
5931 if (i < self->length)
5932 /* copy tail [i:] */
5933 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5934 } else {
5935 /* interleave */
5936 while (n > 0) {
5937 Py_UNICODE_COPY(p, str2->str, str2->length);
5938 p += str2->length;
5939 if (--n <= 0)
5940 break;
5941 *p++ = self->str[i++];
5942 }
5943 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5944 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005947
5948nothing:
5949 /* nothing to replace; return original string (when possible) */
5950 if (PyUnicode_CheckExact(self)) {
5951 Py_INCREF(self);
5952 return (PyObject *) self;
5953 }
5954 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955}
5956
5957/* --- Unicode Object Methods --------------------------------------------- */
5958
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005959PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960"S.title() -> unicode\n\
5961\n\
5962Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005963characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964
5965static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005966unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968 return fixup(self, fixtitle);
5969}
5970
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005971PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972"S.capitalize() -> unicode\n\
5973\n\
5974Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005975have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976
5977static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005978unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980 return fixup(self, fixcapitalize);
5981}
5982
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split self on whitespace,
   capitalize every word in place in the list, then join the words again
   with PyUnicode_Join(NULL, ...).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *item;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it for the original list entry */
    idx = 0;
    while (idx < PyList_GET_SIZE(words)) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                     fixcapitalize);
        if (item == NULL)
            goto onError;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, item);
        idx++;
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, words);

onError:
    /* item is NULL here when a word could not be capitalized */
    Py_DECREF(words);
    return (PyObject *)item;
}
#endif
6020
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006021/* Argument converter. Coerces to a single unicode character */
6022
6023static int
6024convert_uc(PyObject *obj, void *addr)
6025{
6026 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6027 PyObject *uniobj;
6028 Py_UNICODE *unistr;
6029
6030 uniobj = PyUnicode_FromObject(obj);
6031 if (uniobj == NULL) {
6032 PyErr_SetString(PyExc_TypeError,
6033 "The fill character cannot be converted to Unicode");
6034 return 0;
6035 }
6036 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6037 PyErr_SetString(PyExc_TypeError,
6038 "The fill character must be exactly one character long");
6039 Py_DECREF(uniobj);
6040 return 0;
6041 }
6042 unistr = PyUnicode_AS_UNICODE(uniobj);
6043 *fillcharloc = unistr[0];
6044 Py_DECREF(uniobj);
6045 return 1;
6046}
6047
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006048PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006049"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006051Return S centered in a Unicode string of length width. Padding is\n\
6052done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053
6054static PyObject *
6055unicode_center(PyUnicodeObject *self, PyObject *args)
6056{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006057 Py_ssize_t marg, left;
6058 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006059 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060
Thomas Woutersde017742006-02-16 19:34:37 +00006061 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062 return NULL;
6063
Tim Peters7a29bd52001-09-12 03:03:31 +00006064 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006065 Py_INCREF(self);
6066 return (PyObject*) self;
6067 }
6068
6069 marg = width - self->length;
6070 left = marg / 2 + (marg & width & 1);
6071
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006072 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073}
6074
Marc-André Lemburge5034372000-08-08 08:04:29 +00006075#if 0
6076
6077/* This code should go into some future Unicode collation support
6078 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006079 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006080
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006081/* speedy UTF-16 code point order comparison */
6082/* gleaned from: */
6083/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6084
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006085static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006086{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006087 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006088 0, 0, 0, 0, 0, 0, 0, 0,
6089 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006090 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006091};
6092
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093static int
6094unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6095{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006096 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006097
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098 Py_UNICODE *s1 = str1->str;
6099 Py_UNICODE *s2 = str2->str;
6100
6101 len1 = str1->length;
6102 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006103
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006105 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006106
6107 c1 = *s1++;
6108 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006109
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006110 if (c1 > (1<<11) * 26)
6111 c1 += utf16Fixup[c1>>11];
6112 if (c2 > (1<<11) * 26)
6113 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006114 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006115
6116 if (c1 != c2)
6117 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006118
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006119 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120 }
6121
6122 return (len1 < len2) ? -1 : (len1 != len2);
6123}
6124
Marc-André Lemburge5034372000-08-08 08:04:29 +00006125#else
6126
6127static int
6128unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6129{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006130 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006131
6132 Py_UNICODE *s1 = str1->str;
6133 Py_UNICODE *s2 = str2->str;
6134
6135 len1 = str1->length;
6136 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006137
Marc-André Lemburge5034372000-08-08 08:04:29 +00006138 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006139 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006140
Fredrik Lundh45714e92001-06-26 16:39:36 +00006141 c1 = *s1++;
6142 c2 = *s2++;
6143
6144 if (c1 != c2)
6145 return (c1 < c2) ? -1 : 1;
6146
Marc-André Lemburge5034372000-08-08 08:04:29 +00006147 len1--; len2--;
6148 }
6149
6150 return (len1 < len2) ? -1 : (len1 != len2);
6151}
6152
6153#endif
6154
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155int PyUnicode_Compare(PyObject *left,
6156 PyObject *right)
6157{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006158 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6159 return unicode_compare((PyUnicodeObject *)left,
6160 (PyUnicodeObject *)right);
6161 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
6162 (PyUnicode_Check(left) && PyString_Check(right))) {
6163 if (PyUnicode_Check(left))
6164 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
6165 if (PyUnicode_Check(right))
6166 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
6167 assert(PyString_Check(left));
6168 assert(PyString_Check(right));
6169 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006171 PyErr_Format(PyExc_TypeError,
6172 "Can't compare %.100s and %.100s",
6173 left->ob_type->tp_name,
6174 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175 return -1;
6176}
6177
Martin v. Löwis5b222132007-06-10 09:51:05 +00006178int
6179PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6180{
6181 int i;
6182 Py_UNICODE *id;
6183 assert(PyUnicode_Check(uni));
6184 id = PyUnicode_AS_UNICODE(uni);
6185 /* Compare Unicode string and source character set string */
6186 for (i = 0; id[i] && str[i]; i++)
6187 if (id[i] != str[i])
6188 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6189 if (id[i])
6190 return 1; /* uni is longer */
6191 if (str[i])
6192 return -1; /* str is longer */
6193 return 0;
6194}
6195
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006196PyObject *PyUnicode_RichCompare(PyObject *left,
6197 PyObject *right,
6198 int op)
6199{
6200 int result;
6201
6202 result = PyUnicode_Compare(left, right);
6203 if (result == -1 && PyErr_Occurred())
6204 goto onError;
6205
6206 /* Convert the return value to a Boolean */
6207 switch (op) {
6208 case Py_EQ:
6209 result = (result == 0);
6210 break;
6211 case Py_NE:
6212 result = (result != 0);
6213 break;
6214 case Py_LE:
6215 result = (result <= 0);
6216 break;
6217 case Py_GE:
6218 result = (result >= 0);
6219 break;
6220 case Py_LT:
6221 result = (result == -1);
6222 break;
6223 case Py_GT:
6224 result = (result == 1);
6225 break;
6226 }
6227 return PyBool_FromLong(result);
6228
6229 onError:
6230
6231 /* Standard case
6232
6233 Type errors mean that PyUnicode_FromObject() could not convert
6234 one of the arguments (usually the right hand side) to Unicode,
6235 ie. we can't handle the comparison request. However, it is
6236 possible that the other object knows a comparison method, which
6237 is why we return Py_NotImplemented to give the other object a
6238 chance.
6239
6240 */
6241 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6242 PyErr_Clear();
6243 Py_INCREF(Py_NotImplemented);
6244 return Py_NotImplemented;
6245 }
6246 if (op != Py_EQ && op != Py_NE)
6247 return NULL;
6248
6249 /* Equality comparison.
6250
6251 This is a special case: we silence any PyExc_UnicodeDecodeError
6252 and instead turn it into a PyErr_UnicodeWarning.
6253
6254 */
6255 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6256 return NULL;
6257 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006258 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6259 (op == Py_EQ) ?
6260 "Unicode equal comparison "
6261 "failed to convert both arguments to Unicode - "
6262 "interpreting them as being unequal"
6263 :
6264 "Unicode unequal comparison "
6265 "failed to convert both arguments to Unicode - "
6266 "interpreting them as being unequal",
6267 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006268 return NULL;
6269 result = (op == Py_NE);
6270 return PyBool_FromLong(result);
6271}
6272
Guido van Rossum403d68b2000-03-13 15:55:09 +00006273int PyUnicode_Contains(PyObject *container,
6274 PyObject *element)
6275{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006276 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006277 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006278
6279 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006280 sub = PyUnicode_FromObject(element);
6281 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006282 PyErr_Format(PyExc_TypeError,
6283 "'in <string>' requires string as left operand, not %s",
6284 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006285 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006286 }
6287
Thomas Wouters477c8d52006-05-27 19:21:47 +00006288 str = PyUnicode_FromObject(container);
6289 if (!str) {
6290 Py_DECREF(sub);
6291 return -1;
6292 }
6293
6294 result = stringlib_contains_obj(str, sub);
6295
6296 Py_DECREF(str);
6297 Py_DECREF(sub);
6298
Guido van Rossum403d68b2000-03-13 15:55:09 +00006299 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006300}
6301
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302/* Concat to string or Unicode object giving a new Unicode object. */
6303
6304PyObject *PyUnicode_Concat(PyObject *left,
6305 PyObject *right)
6306{
6307 PyUnicodeObject *u = NULL, *v = NULL, *w;
6308
Guido van Rossum84d79dd2007-04-13 02:23:57 +00006309 if (PyBytes_Check(left) || PyBytes_Check(right))
6310 return PyBytes_Concat(left, right);
6311
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312 /* Coerce the two arguments */
6313 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6314 if (u == NULL)
6315 goto onError;
6316 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6317 if (v == NULL)
6318 goto onError;
6319
6320 /* Shortcuts */
6321 if (v == unicode_empty) {
6322 Py_DECREF(v);
6323 return (PyObject *)u;
6324 }
6325 if (u == unicode_empty) {
6326 Py_DECREF(u);
6327 return (PyObject *)v;
6328 }
6329
6330 /* Concat the two Unicode strings */
6331 w = _PyUnicode_New(u->length + v->length);
6332 if (w == NULL)
6333 goto onError;
6334 Py_UNICODE_COPY(w->str, u->str, u->length);
6335 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6336
6337 Py_DECREF(u);
6338 Py_DECREF(v);
6339 return (PyObject *)w;
6340
6341onError:
6342 Py_XDECREF(u);
6343 Py_XDECREF(v);
6344 return NULL;
6345}
6346
Walter Dörwald1ab83302007-05-18 17:15:44 +00006347void
6348PyUnicode_Append(PyObject **pleft, PyObject *right)
6349{
6350 PyObject *new;
6351 if (*pleft == NULL)
6352 return;
6353 if (right == NULL || !PyUnicode_Check(*pleft)) {
6354 Py_DECREF(*pleft);
6355 *pleft = NULL;
6356 return;
6357 }
6358 new = PyUnicode_Concat(*pleft, right);
6359 Py_DECREF(*pleft);
6360 *pleft = new;
6361}
6362
6363void
6364PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6365{
6366 PyUnicode_Append(pleft, right);
6367 Py_XDECREF(right);
6368}
6369
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006370PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371"S.count(sub[, start[, end]]) -> int\n\
6372\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006373Return the number of non-overlapping occurrences of substring sub in\n\
6374Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006375interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006376
6377static PyObject *
6378unicode_count(PyUnicodeObject *self, PyObject *args)
6379{
6380 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006381 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006382 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383 PyObject *result;
6384
Guido van Rossumb8872e62000-05-09 14:14:27 +00006385 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6386 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387 return NULL;
6388
6389 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006390 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391 if (substring == NULL)
6392 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006393
Thomas Wouters477c8d52006-05-27 19:21:47 +00006394 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395
Thomas Wouters477c8d52006-05-27 19:21:47 +00006396 result = PyInt_FromSsize_t(
6397 stringlib_count(self->str + start, end - start,
6398 substring->str, substring->length)
6399 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400
6401 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006402
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403 return result;
6404}
6405
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006406PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006407"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006409Encodes S using the codec registered for encoding. encoding defaults\n\
6410to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006411handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006412a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6413'xmlcharrefreplace' as well as any other name registered with\n\
6414codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415
6416static PyObject *
6417unicode_encode(PyUnicodeObject *self, PyObject *args)
6418{
6419 char *encoding = NULL;
6420 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006421 PyObject *v;
6422
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6424 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006425 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006426 if (v == NULL)
6427 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006428 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00006429 if (PyString_Check(v)) {
6430 /* Old codec, turn it into bytes */
6431 PyObject *b = PyBytes_FromObject(v);
6432 Py_DECREF(v);
6433 return b;
6434 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006435 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006436 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006437 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006438 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006439 Py_DECREF(v);
6440 return NULL;
6441 }
6442 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006443
6444 onError:
6445 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006446}
6447
6448PyDoc_STRVAR(decode__doc__,
6449"S.decode([encoding[,errors]]) -> string or unicode\n\
6450\n\
6451Decodes S using the codec registered for encoding. encoding defaults\n\
6452to the default encoding. errors may be given to set a different error\n\
6453handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6454a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6455as well as any other name registerd with codecs.register_error that is\n\
6456able to handle UnicodeDecodeErrors.");
6457
6458static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006459unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006460{
6461 char *encoding = NULL;
6462 char *errors = NULL;
6463 PyObject *v;
6464
6465 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6466 return NULL;
6467 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006468 if (v == NULL)
6469 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006470 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6471 PyErr_Format(PyExc_TypeError,
6472 "decoder did not return a string/unicode object "
6473 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006474 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006475 Py_DECREF(v);
6476 return NULL;
6477 }
6478 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006479
6480 onError:
6481 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482}
6483
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006484PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485"S.expandtabs([tabsize]) -> unicode\n\
6486\n\
6487Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006488If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489
6490static PyObject*
6491unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6492{
6493 Py_UNICODE *e;
6494 Py_UNICODE *p;
6495 Py_UNICODE *q;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006496 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497 PyUnicodeObject *u;
6498 int tabsize = 8;
6499
6500 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6501 return NULL;
6502
Thomas Wouters7e474022000-07-16 12:04:32 +00006503 /* First pass: determine size of output string */
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006504 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505 e = self->str + self->length;
6506 for (p = self->str; p < e; p++)
6507 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006508 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509 j += tabsize - (j % tabsize);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006510 if (old_j > j) {
6511 PyErr_SetString(PyExc_OverflowError,
6512 "new string is too long");
6513 return NULL;
6514 }
6515 old_j = j;
6516 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517 }
6518 else {
6519 j++;
6520 if (*p == '\n' || *p == '\r') {
6521 i += j;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006522 old_j = j = 0;
6523 if (i < 0) {
6524 PyErr_SetString(PyExc_OverflowError,
6525 "new string is too long");
6526 return NULL;
6527 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528 }
6529 }
6530
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006531 if ((i + j) < 0) {
6532 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6533 return NULL;
6534 }
6535
Guido van Rossumd57fd912000-03-10 22:53:23 +00006536 /* Second pass: create output string and fill it */
6537 u = _PyUnicode_New(i + j);
6538 if (!u)
6539 return NULL;
6540
6541 j = 0;
6542 q = u->str;
6543
6544 for (p = self->str; p < e; p++)
6545 if (*p == '\t') {
6546 if (tabsize > 0) {
6547 i = tabsize - (j % tabsize);
6548 j += i;
6549 while (i--)
6550 *q++ = ' ';
6551 }
6552 }
6553 else {
6554 j++;
6555 *q++ = *p;
6556 if (*p == '\n' || *p == '\r')
6557 j = 0;
6558 }
6559
6560 return (PyObject*) u;
6561}
6562
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006563PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564"S.find(sub [,start [,end]]) -> int\n\
6565\n\
6566Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006567such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568arguments start and end are interpreted as in slice notation.\n\
6569\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006570Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571
6572static PyObject *
6573unicode_find(PyUnicodeObject *self, PyObject *args)
6574{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006575 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006576 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006577 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006578 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579
Guido van Rossumb8872e62000-05-09 14:14:27 +00006580 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6581 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006583 substring = PyUnicode_FromObject(substring);
6584 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585 return NULL;
6586
Thomas Wouters477c8d52006-05-27 19:21:47 +00006587 result = stringlib_find_slice(
6588 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6589 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6590 start, end
6591 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592
6593 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006594
6595 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596}
6597
6598static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006599unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600{
6601 if (index < 0 || index >= self->length) {
6602 PyErr_SetString(PyExc_IndexError, "string index out of range");
6603 return NULL;
6604 }
6605
6606 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6607}
6608
6609static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006610unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006612 /* Since Unicode objects compare equal to their UTF-8 string
6613 counterparts, we hash the UTF-8 string. */
6614 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
6615 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616}
6617
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006618PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619"S.index(sub [,start [,end]]) -> int\n\
6620\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006621Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622
6623static PyObject *
6624unicode_index(PyUnicodeObject *self, PyObject *args)
6625{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006626 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006627 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006628 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006629 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630
Guido van Rossumb8872e62000-05-09 14:14:27 +00006631 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6632 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006634 substring = PyUnicode_FromObject(substring);
6635 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636 return NULL;
6637
Thomas Wouters477c8d52006-05-27 19:21:47 +00006638 result = stringlib_find_slice(
6639 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6640 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6641 start, end
6642 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643
6644 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006645
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646 if (result < 0) {
6647 PyErr_SetString(PyExc_ValueError, "substring not found");
6648 return NULL;
6649 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006650
Martin v. Löwis18e16552006-02-15 17:27:45 +00006651 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652}
6653
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006654PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006655"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006657Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006658at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659
6660static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006661unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662{
6663 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6664 register const Py_UNICODE *e;
6665 int cased;
6666
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667 /* Shortcut for single character strings */
6668 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006669 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006671 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006672 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006673 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006674
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675 e = p + PyUnicode_GET_SIZE(self);
6676 cased = 0;
6677 for (; p < e; p++) {
6678 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006679
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006681 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682 else if (!cased && Py_UNICODE_ISLOWER(ch))
6683 cased = 1;
6684 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006685 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686}
6687
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006688PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006689"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006691Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006692at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693
6694static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006695unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696{
6697 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6698 register const Py_UNICODE *e;
6699 int cased;
6700
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701 /* Shortcut for single character strings */
6702 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006703 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006705 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006706 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006707 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006708
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709 e = p + PyUnicode_GET_SIZE(self);
6710 cased = 0;
6711 for (; p < e; p++) {
6712 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006713
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006715 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716 else if (!cased && Py_UNICODE_ISUPPER(ch))
6717 cased = 1;
6718 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006719 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720}
6721
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006722PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006723"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006725Return True if S is a titlecased string and there is at least one\n\
6726character in S, i.e. upper- and titlecase characters may only\n\
6727follow uncased characters and lowercase characters only cased ones.\n\
6728Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729
6730static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006731unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732{
6733 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6734 register const Py_UNICODE *e;
6735 int cased, previous_is_cased;
6736
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737 /* Shortcut for single character strings */
6738 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006739 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6740 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006742 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006743 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006744 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006745
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746 e = p + PyUnicode_GET_SIZE(self);
6747 cased = 0;
6748 previous_is_cased = 0;
6749 for (; p < e; p++) {
6750 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006751
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6753 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006754 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755 previous_is_cased = 1;
6756 cased = 1;
6757 }
6758 else if (Py_UNICODE_ISLOWER(ch)) {
6759 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006760 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761 previous_is_cased = 1;
6762 cased = 1;
6763 }
6764 else
6765 previous_is_cased = 0;
6766 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006767 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768}
6769
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006770PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006771"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006773Return True if all characters in S are whitespace\n\
6774and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775
6776static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006777unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778{
6779 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6780 register const Py_UNICODE *e;
6781
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782 /* Shortcut for single character strings */
6783 if (PyUnicode_GET_SIZE(self) == 1 &&
6784 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006785 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006787 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006788 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006789 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006790
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791 e = p + PyUnicode_GET_SIZE(self);
6792 for (; p < e; p++) {
6793 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006794 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006796 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006797}
6798
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006799PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006800"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006801\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006802Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006803and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006804
6805static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006806unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006807{
6808 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6809 register const Py_UNICODE *e;
6810
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006811 /* Shortcut for single character strings */
6812 if (PyUnicode_GET_SIZE(self) == 1 &&
6813 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006814 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006815
6816 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006817 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006818 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006819
6820 e = p + PyUnicode_GET_SIZE(self);
6821 for (; p < e; p++) {
6822 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006823 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006824 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006825 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006826}
6827
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006828PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006829"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006830\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006831Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006832and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006833
6834static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006835unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006836{
6837 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6838 register const Py_UNICODE *e;
6839
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006840 /* Shortcut for single character strings */
6841 if (PyUnicode_GET_SIZE(self) == 1 &&
6842 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006843 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006844
6845 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006846 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006847 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006848
6849 e = p + PyUnicode_GET_SIZE(self);
6850 for (; p < e; p++) {
6851 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006852 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006853 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006854 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006855}
6856
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006857PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006858"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006860Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006861False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862
6863static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006864unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865{
6866 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6867 register const Py_UNICODE *e;
6868
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869 /* Shortcut for single character strings */
6870 if (PyUnicode_GET_SIZE(self) == 1 &&
6871 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006872 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006874 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006875 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006876 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006877
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878 e = p + PyUnicode_GET_SIZE(self);
6879 for (; p < e; p++) {
6880 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006881 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006883 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884}
6885
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006886PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006887"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006889Return True if all characters in S are digits\n\
6890and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891
6892static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006893unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894{
6895 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6896 register const Py_UNICODE *e;
6897
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898 /* Shortcut for single character strings */
6899 if (PyUnicode_GET_SIZE(self) == 1 &&
6900 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006901 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006903 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006904 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006905 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006906
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907 e = p + PyUnicode_GET_SIZE(self);
6908 for (; p < e; p++) {
6909 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006910 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006912 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913}
6914
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006915PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006916"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006918Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006919False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920
6921static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006922unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923{
6924 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6925 register const Py_UNICODE *e;
6926
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927 /* Shortcut for single character strings */
6928 if (PyUnicode_GET_SIZE(self) == 1 &&
6929 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006930 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006932 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006933 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006934 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006935
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936 e = p + PyUnicode_GET_SIZE(self);
6937 for (; p < e; p++) {
6938 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006939 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006941 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942}
6943
Martin v. Löwis47383402007-08-15 07:32:56 +00006944int
6945PyUnicode_IsIdentifier(PyObject *self)
6946{
6947 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
6948 register const Py_UNICODE *e;
6949
6950 /* Special case for empty strings */
6951 if (PyUnicode_GET_SIZE(self) == 0)
6952 return 0;
6953
6954 /* PEP 3131 says that the first character must be in
6955 XID_Start and subsequent characters in XID_Continue,
6956 and for the ASCII range, the 2.x rules apply (i.e
6957 start with letters and underscore, continue with
6958 letters, digits, underscore). However, given the current
6959 definition of XID_Start and XID_Continue, it is sufficient
6960 to check just for these, except that _ must be allowed
6961 as starting an identifier. */
6962 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
6963 return 0;
6964
6965 e = p + PyUnicode_GET_SIZE(self);
6966 for (p++; p < e; p++) {
6967 if (!_PyUnicode_IsXidContinue(*p))
6968 return 0;
6969 }
6970 return 1;
6971}
6972
6973PyDoc_STRVAR(isidentifier__doc__,
6974"S.isidentifier() -> bool\n\
6975\n\
6976Return True if S is a valid identifier according\n\
6977to the language definition.");
6978
6979static PyObject*
6980unicode_isidentifier(PyObject *self)
6981{
6982 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
6983}
6984
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006985PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986"S.join(sequence) -> unicode\n\
6987\n\
6988Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006989sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990
6991static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006992unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006994 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995}
6996
Martin v. Löwis18e16552006-02-15 17:27:45 +00006997static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998unicode_length(PyUnicodeObject *self)
6999{
7000 return self->length;
7001}
7002
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007003PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00007004"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005\n\
7006Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007007done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008
7009static PyObject *
7010unicode_ljust(PyUnicodeObject *self, PyObject *args)
7011{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007012 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007013 Py_UNICODE fillchar = ' ';
7014
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007015 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016 return NULL;
7017
Tim Peters7a29bd52001-09-12 03:03:31 +00007018 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019 Py_INCREF(self);
7020 return (PyObject*) self;
7021 }
7022
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007023 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024}
7025
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007026PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027"S.lower() -> unicode\n\
7028\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007029Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030
7031static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007032unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034 return fixup(self, fixlower);
7035}
7036
/* Strip modes accepted by the strip helpers. */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* PyArg_ParseTuple format strings, indexed by the strip mode above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7045
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007046/* externally visible for str.strip(unicode) */
7047PyObject *
7048_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7049{
7050 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007051 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007052 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007053 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7054 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007055
Thomas Wouters477c8d52006-05-27 19:21:47 +00007056 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7057
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007058 i = 0;
7059 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007060 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7061 i++;
7062 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007063 }
7064
7065 j = len;
7066 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007067 do {
7068 j--;
7069 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7070 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007071 }
7072
7073 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007074 Py_INCREF(self);
7075 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007076 }
7077 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007078 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007079}
7080
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081
7082static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007083do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007085 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007086 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007087
7088 i = 0;
7089 if (striptype != RIGHTSTRIP) {
7090 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7091 i++;
7092 }
7093 }
7094
7095 j = len;
7096 if (striptype != LEFTSTRIP) {
7097 do {
7098 j--;
7099 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7100 j++;
7101 }
7102
7103 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7104 Py_INCREF(self);
7105 return (PyObject*)self;
7106 }
7107 else
7108 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109}
7110
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007111
7112static PyObject *
7113do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7114{
7115 PyObject *sep = NULL;
7116
7117 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7118 return NULL;
7119
7120 if (sep != NULL && sep != Py_None) {
7121 if (PyUnicode_Check(sep))
7122 return _PyUnicode_XStrip(self, striptype, sep);
7123 else if (PyString_Check(sep)) {
7124 PyObject *res;
7125 sep = PyUnicode_FromObject(sep);
7126 if (sep==NULL)
7127 return NULL;
7128 res = _PyUnicode_XStrip(self, striptype, sep);
7129 Py_DECREF(sep);
7130 return res;
7131 }
7132 else {
7133 PyErr_Format(PyExc_TypeError,
7134 "%s arg must be None, unicode or str",
7135 STRIPNAME(striptype));
7136 return NULL;
7137 }
7138 }
7139
7140 return do_strip(self, striptype);
7141}
7142
7143
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007144PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007145"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007146\n\
7147Return a copy of the string S with leading and trailing\n\
7148whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007149If chars is given and not None, remove characters in chars instead.\n\
7150If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007151
7152static PyObject *
7153unicode_strip(PyUnicodeObject *self, PyObject *args)
7154{
7155 if (PyTuple_GET_SIZE(args) == 0)
7156 return do_strip(self, BOTHSTRIP); /* Common case */
7157 else
7158 return do_argstrip(self, BOTHSTRIP, args);
7159}
7160
7161
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007162PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007163"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007164\n\
7165Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007166If chars is given and not None, remove characters in chars instead.\n\
7167If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007168
7169static PyObject *
7170unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7171{
7172 if (PyTuple_GET_SIZE(args) == 0)
7173 return do_strip(self, LEFTSTRIP); /* Common case */
7174 else
7175 return do_argstrip(self, LEFTSTRIP, args);
7176}
7177
7178
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007179PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007180"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007181\n\
7182Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007183If chars is given and not None, remove characters in chars instead.\n\
7184If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007185
7186static PyObject *
7187unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7188{
7189 if (PyTuple_GET_SIZE(args) == 0)
7190 return do_strip(self, RIGHTSTRIP); /* Common case */
7191 else
7192 return do_argstrip(self, RIGHTSTRIP, args);
7193}
7194
7195
Guido van Rossumd57fd912000-03-10 22:53:23 +00007196static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007197unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007198{
7199 PyUnicodeObject *u;
7200 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007201 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007202 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007203
7204 if (len < 0)
7205 len = 0;
7206
Tim Peters7a29bd52001-09-12 03:03:31 +00007207 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007208 /* no repeat, return original string */
7209 Py_INCREF(str);
7210 return (PyObject*) str;
7211 }
Tim Peters8f422462000-09-09 06:13:41 +00007212
7213 /* ensure # of chars needed doesn't overflow int and # of bytes
7214 * needed doesn't overflow size_t
7215 */
7216 nchars = len * str->length;
7217 if (len && nchars / len != str->length) {
7218 PyErr_SetString(PyExc_OverflowError,
7219 "repeated string is too long");
7220 return NULL;
7221 }
7222 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7223 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7224 PyErr_SetString(PyExc_OverflowError,
7225 "repeated string is too long");
7226 return NULL;
7227 }
7228 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007229 if (!u)
7230 return NULL;
7231
7232 p = u->str;
7233
Thomas Wouters477c8d52006-05-27 19:21:47 +00007234 if (str->length == 1 && len > 0) {
7235 Py_UNICODE_FILL(p, str->str[0], len);
7236 } else {
7237 Py_ssize_t done = 0; /* number of characters copied this far */
7238 if (done < nchars) {
7239 Py_UNICODE_COPY(p, str->str, str->length);
7240 done = str->length;
7241 }
7242 while (done < nchars) {
7243 int n = (done <= nchars-done) ? done : nchars-done;
7244 Py_UNICODE_COPY(p+done, p, n);
7245 done += n;
7246 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007247 }
7248
7249 return (PyObject*) u;
7250}
7251
7252PyObject *PyUnicode_Replace(PyObject *obj,
7253 PyObject *subobj,
7254 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007255 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256{
7257 PyObject *self;
7258 PyObject *str1;
7259 PyObject *str2;
7260 PyObject *result;
7261
7262 self = PyUnicode_FromObject(obj);
7263 if (self == NULL)
7264 return NULL;
7265 str1 = PyUnicode_FromObject(subobj);
7266 if (str1 == NULL) {
7267 Py_DECREF(self);
7268 return NULL;
7269 }
7270 str2 = PyUnicode_FromObject(replobj);
7271 if (str2 == NULL) {
7272 Py_DECREF(self);
7273 Py_DECREF(str1);
7274 return NULL;
7275 }
Tim Petersced69f82003-09-16 20:30:58 +00007276 result = replace((PyUnicodeObject *)self,
7277 (PyUnicodeObject *)str1,
7278 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279 maxcount);
7280 Py_DECREF(self);
7281 Py_DECREF(str1);
7282 Py_DECREF(str2);
7283 return result;
7284}
7285
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007286PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007287"S.replace (old, new[, maxsplit]) -> unicode\n\
7288\n\
7289Return a copy of S with all occurrences of substring\n\
7290old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007291given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292
7293static PyObject*
7294unicode_replace(PyUnicodeObject *self, PyObject *args)
7295{
7296 PyUnicodeObject *str1;
7297 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007298 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007299 PyObject *result;
7300
Martin v. Löwis18e16552006-02-15 17:27:45 +00007301 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302 return NULL;
7303 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7304 if (str1 == NULL)
7305 return NULL;
7306 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007307 if (str2 == NULL) {
7308 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007309 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007310 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007311
7312 result = replace(self, str1, str2, maxcount);
7313
7314 Py_DECREF(str1);
7315 Py_DECREF(str2);
7316 return result;
7317}
7318
7319static
7320PyObject *unicode_repr(PyObject *unicode)
7321{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007322 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007323 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007324 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7325 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7326
7327 /* XXX(nnorwitz): rather than over-allocating, it would be
7328 better to choose a different scheme. Perhaps scan the
7329 first N-chars of the string and allocate based on that size.
7330 */
7331 /* Initial allocation is based on the longest-possible unichr
7332 escape.
7333
7334 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7335 unichr, so in this case it's the longest unichr escape. In
7336 narrow (UTF-16) builds this is five chars per source unichr
7337 since there are two unichrs in the surrogate pair, so in narrow
7338 (UTF-16) builds it's not the longest unichr escape.
7339
7340 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7341 so in the narrow (UTF-16) build case it's the longest unichr
7342 escape.
7343 */
7344
Walter Dörwald1ab83302007-05-18 17:15:44 +00007345 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007346 2 /* quotes */
7347#ifdef Py_UNICODE_WIDE
7348 + 10*size
7349#else
7350 + 6*size
7351#endif
7352 + 1);
7353 if (repr == NULL)
7354 return NULL;
7355
Walter Dörwald1ab83302007-05-18 17:15:44 +00007356 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007357
7358 /* Add quote */
7359 *p++ = (findchar(s, size, '\'') &&
7360 !findchar(s, size, '"')) ? '"' : '\'';
7361 while (size-- > 0) {
7362 Py_UNICODE ch = *s++;
7363
7364 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007365 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007366 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007367 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007368 continue;
7369 }
7370
7371#ifdef Py_UNICODE_WIDE
7372 /* Map 21-bit characters to '\U00xxxxxx' */
7373 else if (ch >= 0x10000) {
7374 *p++ = '\\';
7375 *p++ = 'U';
7376 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7377 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7378 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7379 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7380 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7381 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7382 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7383 *p++ = hexdigits[ch & 0x0000000F];
7384 continue;
7385 }
7386#else
7387 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7388 else if (ch >= 0xD800 && ch < 0xDC00) {
7389 Py_UNICODE ch2;
7390 Py_UCS4 ucs;
7391
7392 ch2 = *s++;
7393 size--;
7394 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7395 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7396 *p++ = '\\';
7397 *p++ = 'U';
7398 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7399 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7400 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7401 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7402 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7403 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7404 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7405 *p++ = hexdigits[ucs & 0x0000000F];
7406 continue;
7407 }
7408 /* Fall through: isolated surrogates are copied as-is */
7409 s--;
7410 size++;
7411 }
7412#endif
7413
7414 /* Map 16-bit characters to '\uxxxx' */
7415 if (ch >= 256) {
7416 *p++ = '\\';
7417 *p++ = 'u';
7418 *p++ = hexdigits[(ch >> 12) & 0x000F];
7419 *p++ = hexdigits[(ch >> 8) & 0x000F];
7420 *p++ = hexdigits[(ch >> 4) & 0x000F];
7421 *p++ = hexdigits[ch & 0x000F];
7422 }
7423
7424 /* Map special whitespace to '\t', \n', '\r' */
7425 else if (ch == '\t') {
7426 *p++ = '\\';
7427 *p++ = 't';
7428 }
7429 else if (ch == '\n') {
7430 *p++ = '\\';
7431 *p++ = 'n';
7432 }
7433 else if (ch == '\r') {
7434 *p++ = '\\';
7435 *p++ = 'r';
7436 }
7437
7438 /* Map non-printable US ASCII to '\xhh' */
7439 else if (ch < ' ' || ch >= 0x7F) {
7440 *p++ = '\\';
7441 *p++ = 'x';
7442 *p++ = hexdigits[(ch >> 4) & 0x000F];
7443 *p++ = hexdigits[ch & 0x000F];
7444 }
7445
7446 /* Copy everything else as-is */
7447 else
7448 *p++ = (char) ch;
7449 }
7450 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007451 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007452
7453 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007454 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007455 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456}
7457
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007458PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459"S.rfind(sub [,start [,end]]) -> int\n\
7460\n\
7461Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007462such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463arguments start and end are interpreted as in slice notation.\n\
7464\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007465Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466
7467static PyObject *
7468unicode_rfind(PyUnicodeObject *self, PyObject *args)
7469{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007470 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007471 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007472 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007473 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474
Guido van Rossumb8872e62000-05-09 14:14:27 +00007475 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7476 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007477 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007478 substring = PyUnicode_FromObject(substring);
7479 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007480 return NULL;
7481
Thomas Wouters477c8d52006-05-27 19:21:47 +00007482 result = stringlib_rfind_slice(
7483 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7484 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7485 start, end
7486 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007487
7488 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007489
7490 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007491}
7492
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007493PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007494"S.rindex(sub [,start [,end]]) -> int\n\
7495\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007496Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497
7498static PyObject *
7499unicode_rindex(PyUnicodeObject *self, PyObject *args)
7500{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007501 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007502 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007503 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007504 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505
Guido van Rossumb8872e62000-05-09 14:14:27 +00007506 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7507 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007509 substring = PyUnicode_FromObject(substring);
7510 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007511 return NULL;
7512
Thomas Wouters477c8d52006-05-27 19:21:47 +00007513 result = stringlib_rfind_slice(
7514 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7515 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7516 start, end
7517 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007518
7519 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007520
Guido van Rossumd57fd912000-03-10 22:53:23 +00007521 if (result < 0) {
7522 PyErr_SetString(PyExc_ValueError, "substring not found");
7523 return NULL;
7524 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007525 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007526}
7527
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007528PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007529"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007530\n\
7531Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007532done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533
7534static PyObject *
7535unicode_rjust(PyUnicodeObject *self, PyObject *args)
7536{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007537 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007538 Py_UNICODE fillchar = ' ';
7539
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007540 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007541 return NULL;
7542
Tim Peters7a29bd52001-09-12 03:03:31 +00007543 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007544 Py_INCREF(self);
7545 return (PyObject*) self;
7546 }
7547
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007548 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549}
7550
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007552unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007553{
7554 /* standard clamping */
7555 if (start < 0)
7556 start = 0;
7557 if (end < 0)
7558 end = 0;
7559 if (end > self->length)
7560 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007561 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007562 /* full slice, return original string */
7563 Py_INCREF(self);
7564 return (PyObject*) self;
7565 }
7566 if (start > end)
7567 start = end;
7568 /* copy slice */
7569 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7570 end - start);
7571}
7572
7573PyObject *PyUnicode_Split(PyObject *s,
7574 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007575 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576{
7577 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007578
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579 s = PyUnicode_FromObject(s);
7580 if (s == NULL)
7581 return NULL;
7582 if (sep != NULL) {
7583 sep = PyUnicode_FromObject(sep);
7584 if (sep == NULL) {
7585 Py_DECREF(s);
7586 return NULL;
7587 }
7588 }
7589
7590 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7591
7592 Py_DECREF(s);
7593 Py_XDECREF(sep);
7594 return result;
7595}
7596
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007597PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007598"S.split([sep [,maxsplit]]) -> list of strings\n\
7599\n\
7600Return a list of the words in S, using sep as the\n\
7601delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007602splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007603any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007604
7605static PyObject*
7606unicode_split(PyUnicodeObject *self, PyObject *args)
7607{
7608 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007609 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007610
Martin v. Löwis18e16552006-02-15 17:27:45 +00007611 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612 return NULL;
7613
7614 if (substring == Py_None)
7615 return split(self, NULL, maxcount);
7616 else if (PyUnicode_Check(substring))
7617 return split(self, (PyUnicodeObject *)substring, maxcount);
7618 else
7619 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7620}
7621
Thomas Wouters477c8d52006-05-27 19:21:47 +00007622PyObject *
7623PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7624{
7625 PyObject* str_obj;
7626 PyObject* sep_obj;
7627 PyObject* out;
7628
7629 str_obj = PyUnicode_FromObject(str_in);
7630 if (!str_obj)
7631 return NULL;
7632 sep_obj = PyUnicode_FromObject(sep_in);
7633 if (!sep_obj) {
7634 Py_DECREF(str_obj);
7635 return NULL;
7636 }
7637
7638 out = stringlib_partition(
7639 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7640 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7641 );
7642
7643 Py_DECREF(sep_obj);
7644 Py_DECREF(str_obj);
7645
7646 return out;
7647}
7648
7649
7650PyObject *
7651PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7652{
7653 PyObject* str_obj;
7654 PyObject* sep_obj;
7655 PyObject* out;
7656
7657 str_obj = PyUnicode_FromObject(str_in);
7658 if (!str_obj)
7659 return NULL;
7660 sep_obj = PyUnicode_FromObject(sep_in);
7661 if (!sep_obj) {
7662 Py_DECREF(str_obj);
7663 return NULL;
7664 }
7665
7666 out = stringlib_rpartition(
7667 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7668 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7669 );
7670
7671 Py_DECREF(sep_obj);
7672 Py_DECREF(str_obj);
7673
7674 return out;
7675}
7676
7677PyDoc_STRVAR(partition__doc__,
7678"S.partition(sep) -> (head, sep, tail)\n\
7679\n\
7680Searches for the separator sep in S, and returns the part before it,\n\
7681the separator itself, and the part after it. If the separator is not\n\
7682found, returns S and two empty strings.");
7683
7684static PyObject*
7685unicode_partition(PyUnicodeObject *self, PyObject *separator)
7686{
7687 return PyUnicode_Partition((PyObject *)self, separator);
7688}
7689
7690PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007691"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007692\n\
7693Searches for the separator sep in S, starting at the end of S, and returns\n\
7694the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007695separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007696
7697static PyObject*
7698unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7699{
7700 return PyUnicode_RPartition((PyObject *)self, separator);
7701}
7702
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007703PyObject *PyUnicode_RSplit(PyObject *s,
7704 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007705 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007706{
7707 PyObject *result;
7708
7709 s = PyUnicode_FromObject(s);
7710 if (s == NULL)
7711 return NULL;
7712 if (sep != NULL) {
7713 sep = PyUnicode_FromObject(sep);
7714 if (sep == NULL) {
7715 Py_DECREF(s);
7716 return NULL;
7717 }
7718 }
7719
7720 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7721
7722 Py_DECREF(s);
7723 Py_XDECREF(sep);
7724 return result;
7725}
7726
7727PyDoc_STRVAR(rsplit__doc__,
7728"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7729\n\
7730Return a list of the words in S, using sep as the\n\
7731delimiter string, starting at the end of the string and\n\
7732working to the front. If maxsplit is given, at most maxsplit\n\
7733splits are done. If sep is not specified, any whitespace string\n\
7734is a separator.");
7735
7736static PyObject*
7737unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7738{
7739 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007740 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007741
Martin v. Löwis18e16552006-02-15 17:27:45 +00007742 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007743 return NULL;
7744
7745 if (substring == Py_None)
7746 return rsplit(self, NULL, maxcount);
7747 else if (PyUnicode_Check(substring))
7748 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7749 else
7750 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7751}
7752
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007753PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007754"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755\n\
7756Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007757Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007758is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007759
7760static PyObject*
7761unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7762{
Guido van Rossum86662912000-04-11 15:38:46 +00007763 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007764
Guido van Rossum86662912000-04-11 15:38:46 +00007765 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766 return NULL;
7767
Guido van Rossum86662912000-04-11 15:38:46 +00007768 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007769}
7770
7771static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007772PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007773{
Walter Dörwald346737f2007-05-31 10:44:43 +00007774 if (PyUnicode_CheckExact(self)) {
7775 Py_INCREF(self);
7776 return self;
7777 } else
7778 /* Subtype -- return genuine unicode string with the same value. */
7779 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7780 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007781}
7782
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007783PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007784"S.swapcase() -> unicode\n\
7785\n\
7786Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007787and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007788
7789static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007790unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007791{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792 return fixup(self, fixswapcase);
7793}
7794
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007795PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796"S.translate(table) -> unicode\n\
7797\n\
7798Return a copy of the string S, where all characters have been mapped\n\
7799through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007800Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7801Unmapped characters are left untouched. Characters mapped to None\n\
7802are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007803
7804static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007805unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007806{
Tim Petersced69f82003-09-16 20:30:58 +00007807 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007808 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007809 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007810 "ignore");
7811}
7812
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007813PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007814"S.upper() -> unicode\n\
7815\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007816Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007817
7818static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007819unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007820{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821 return fixup(self, fixupper);
7822}
7823
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007824PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007825"S.zfill(width) -> unicode\n\
7826\n\
7827Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007828of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007829
7830static PyObject *
7831unicode_zfill(PyUnicodeObject *self, PyObject *args)
7832{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007833 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007834 PyUnicodeObject *u;
7835
Martin v. Löwis18e16552006-02-15 17:27:45 +00007836 Py_ssize_t width;
7837 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838 return NULL;
7839
7840 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007841 if (PyUnicode_CheckExact(self)) {
7842 Py_INCREF(self);
7843 return (PyObject*) self;
7844 }
7845 else
7846 return PyUnicode_FromUnicode(
7847 PyUnicode_AS_UNICODE(self),
7848 PyUnicode_GET_SIZE(self)
7849 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007850 }
7851
7852 fill = width - self->length;
7853
7854 u = pad(self, fill, 0, '0');
7855
Walter Dörwald068325e2002-04-15 13:36:47 +00007856 if (u == NULL)
7857 return NULL;
7858
Guido van Rossumd57fd912000-03-10 22:53:23 +00007859 if (u->str[fill] == '+' || u->str[fill] == '-') {
7860 /* move sign to beginning of string */
7861 u->str[0] = u->str[fill];
7862 u->str[fill] = '0';
7863 }
7864
7865 return (PyObject*) u;
7866}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867
#if 0
/* Debugging aid, compiled out: reports unicode_freelist_size as a
   Python integer. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(unicode_freelist_size);
}
#endif
7875
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007876PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007877"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007878\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007879Return True if S starts with the specified prefix, False otherwise.\n\
7880With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007881With optional end, stop comparing S at that position.\n\
7882prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007883
7884static PyObject *
7885unicode_startswith(PyUnicodeObject *self,
7886 PyObject *args)
7887{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007888 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007889 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007890 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007891 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007892 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007893
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007894 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007895 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007896 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007897 if (PyTuple_Check(subobj)) {
7898 Py_ssize_t i;
7899 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7900 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7901 PyTuple_GET_ITEM(subobj, i));
7902 if (substring == NULL)
7903 return NULL;
7904 result = tailmatch(self, substring, start, end, -1);
7905 Py_DECREF(substring);
7906 if (result) {
7907 Py_RETURN_TRUE;
7908 }
7909 }
7910 /* nothing matched */
7911 Py_RETURN_FALSE;
7912 }
7913 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007914 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007915 return NULL;
7916 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007917 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007918 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007919}
7920
7921
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007922PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007923"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007924\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007925Return True if S ends with the specified suffix, False otherwise.\n\
7926With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007927With optional end, stop comparing S at that position.\n\
7928suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007929
7930static PyObject *
7931unicode_endswith(PyUnicodeObject *self,
7932 PyObject *args)
7933{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007934 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007935 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007936 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007937 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007938 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007939
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007940 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7941 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007942 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007943 if (PyTuple_Check(subobj)) {
7944 Py_ssize_t i;
7945 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7946 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7947 PyTuple_GET_ITEM(subobj, i));
7948 if (substring == NULL)
7949 return NULL;
7950 result = tailmatch(self, substring, start, end, +1);
7951 Py_DECREF(substring);
7952 if (result) {
7953 Py_RETURN_TRUE;
7954 }
7955 }
7956 Py_RETURN_FALSE;
7957 }
7958 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007959 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007960 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007961
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007962 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007963 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007964 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007965}
7966
7967
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007968
7969static PyObject *
7970unicode_getnewargs(PyUnicodeObject *v)
7971{
7972 return Py_BuildValue("(u#)", v->str, v->length);
7973}
7974
7975
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976static PyMethodDef unicode_methods[] = {
7977
7978 /* Order is according to common usage: often used methods should
7979 appear first, since lookup is done sequentially. */
7980
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007981 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7982 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7983 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007984 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007985 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7986 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7987 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7988 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7989 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7990 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7991 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007992 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007993 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7994 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7995 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007996 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007997 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007998/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7999 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8000 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8001 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008002 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008003 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008004 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008005 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008006 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8007 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8008 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8009 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8010 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8011 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8012 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8013 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8014 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8015 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8016 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8017 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8018 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8019 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008020 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008021 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008022#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008023 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024#endif
8025
8026#if 0
8027 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008028 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008029#endif
8030
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008031 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008032 {NULL, NULL}
8033};
8034
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008035static PyObject *
8036unicode_mod(PyObject *v, PyObject *w)
8037{
8038 if (!PyUnicode_Check(v)) {
8039 Py_INCREF(Py_NotImplemented);
8040 return Py_NotImplemented;
8041 }
8042 return PyUnicode_Format(v, w);
8043}
8044
8045static PyNumberMethods unicode_as_number = {
8046 0, /*nb_add*/
8047 0, /*nb_subtract*/
8048 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008049 unicode_mod, /*nb_remainder*/
8050};
8051
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008053 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008054 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008055 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8056 (ssizeargfunc) unicode_getitem, /* sq_item */
8057 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008058 0, /* sq_ass_item */
8059 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008060 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008061};
8062
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008063static PyObject*
8064unicode_subscript(PyUnicodeObject* self, PyObject* item)
8065{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008066 if (PyIndex_Check(item)) {
8067 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008068 if (i == -1 && PyErr_Occurred())
8069 return NULL;
8070 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008071 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008072 return unicode_getitem(self, i);
8073 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008074 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008075 Py_UNICODE* source_buf;
8076 Py_UNICODE* result_buf;
8077 PyObject* result;
8078
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008079 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008080 &start, &stop, &step, &slicelength) < 0) {
8081 return NULL;
8082 }
8083
8084 if (slicelength <= 0) {
8085 return PyUnicode_FromUnicode(NULL, 0);
8086 } else {
8087 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008088 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
8089 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008090
8091 if (result_buf == NULL)
8092 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008093
8094 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8095 result_buf[i] = source_buf[cur];
8096 }
Tim Petersced69f82003-09-16 20:30:58 +00008097
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008098 result = PyUnicode_FromUnicode(result_buf, slicelength);
8099 PyMem_FREE(result_buf);
8100 return result;
8101 }
8102 } else {
8103 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8104 return NULL;
8105 }
8106}
8107
8108static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008109 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008110 (binaryfunc)unicode_subscript, /* mp_subscript */
8111 (objobjargproc)0, /* mp_ass_subscript */
8112};
8113
Guido van Rossumd57fd912000-03-10 22:53:23 +00008114
8115static int
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008116unicode_buffer_getbuffer(PyUnicodeObject *self, PyBuffer *view, int flags)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008117{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008118
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008119 if (flags & PyBUF_CHARACTER) {
8120 PyObject *str;
8121
8122 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
8123 if (str == NULL) return -1;
8124 return PyBuffer_FillInfo(view, (void *)PyString_AS_STRING(str),
8125 PyString_GET_SIZE(str), 1, flags);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126 }
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008127 else {
8128 return PyBuffer_FillInfo(view, (void *)self->str,
8129 PyUnicode_GET_DATA_SIZE(self), 1, flags);
8130 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131}
8132
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008133
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134/* Helpers for PyUnicode_Format() */
8135
8136static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008137getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008138{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008139 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008140 if (argidx < arglen) {
8141 (*p_argidx)++;
8142 if (arglen < 0)
8143 return args;
8144 else
8145 return PyTuple_GetItem(args, argidx);
8146 }
8147 PyErr_SetString(PyExc_TypeError,
8148 "not enough arguments for format string");
8149 return NULL;
8150}
8151
8152#define F_LJUST (1<<0)
8153#define F_SIGN (1<<1)
8154#define F_BLANK (1<<2)
8155#define F_ALT (1<<3)
8156#define F_ZERO (1<<4)
8157
Martin v. Löwis18e16552006-02-15 17:27:45 +00008158static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008159strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008160{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008161 register Py_ssize_t i;
8162 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008163 for (i = len - 1; i >= 0; i--)
8164 buffer[i] = (Py_UNICODE) charbuffer[i];
8165
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166 return len;
8167}
8168
Neal Norwitzfc76d632006-01-10 06:03:13 +00008169static int
8170doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8171{
Tim Peters15231542006-02-16 01:08:01 +00008172 Py_ssize_t result;
8173
Neal Norwitzfc76d632006-01-10 06:03:13 +00008174 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008175 result = strtounicode(buffer, (char *)buffer);
8176 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008177}
8178
8179static int
8180longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8181{
Tim Peters15231542006-02-16 01:08:01 +00008182 Py_ssize_t result;
8183
Neal Norwitzfc76d632006-01-10 06:03:13 +00008184 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008185 result = strtounicode(buffer, (char *)buffer);
8186 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008187}
8188
Guido van Rossum078151d2002-08-11 04:24:12 +00008189/* XXX To save some code duplication, formatfloat/long/int could have been
8190 shared with stringobject.c, converting from 8-bit to Unicode after the
8191 formatting is done. */
8192
Guido van Rossumd57fd912000-03-10 22:53:23 +00008193static int
8194formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008195 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008196 int flags,
8197 int prec,
8198 int type,
8199 PyObject *v)
8200{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008201 /* fmt = '%#.' + `prec` + `type`
8202 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008203 char fmt[20];
8204 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008205
Guido van Rossumd57fd912000-03-10 22:53:23 +00008206 x = PyFloat_AsDouble(v);
8207 if (x == -1.0 && PyErr_Occurred())
8208 return -1;
8209 if (prec < 0)
8210 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008211 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8212 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008213 /* Worst case length calc to ensure no buffer overrun:
8214
8215 'g' formats:
8216 fmt = %#.<prec>g
8217 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8218 for any double rep.)
8219 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8220
8221 'f' formats:
8222 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8223 len = 1 + 50 + 1 + prec = 52 + prec
8224
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008225 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008226 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008227
8228 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008229 if (((type == 'g' || type == 'G') &&
8230 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008231 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008232 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008233 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008234 return -1;
8235 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008236 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8237 (flags&F_ALT) ? "#" : "",
8238 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008239 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008240}
8241
Tim Peters38fd5b62000-09-21 05:43:11 +00008242static PyObject*
8243formatlong(PyObject *val, int flags, int prec, int type)
8244{
8245 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008246 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008247 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008248 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008249
8250 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8251 if (!str)
8252 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008253 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008254 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008255 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008256}
8257
Guido van Rossumd57fd912000-03-10 22:53:23 +00008258static int
8259formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008260 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008261 int flags,
8262 int prec,
8263 int type,
8264 PyObject *v)
8265{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008266 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008267 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8268 * + 1 + 1
8269 * = 24
8270 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008271 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008272 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008273 long x;
8274
8275 x = PyInt_AsLong(v);
8276 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008277 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008278 if (x < 0 && type == 'u') {
8279 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008280 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008281 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8282 sign = "-";
8283 else
8284 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008286 prec = 1;
8287
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008288 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8289 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008290 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008291 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008292 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008293 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008294 return -1;
8295 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008296
8297 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008298 (type == 'x' || type == 'X' || type == 'o')) {
8299 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008300 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008301 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008302 * - when 0 is being converted, the C standard leaves off
8303 * the '0x' or '0X', which is inconsistent with other
8304 * %#x/%#X conversions and inconsistent with Python's
8305 * hex() function
8306 * - there are platforms that violate the standard and
8307 * convert 0 with the '0x' or '0X'
8308 * (Metrowerks, Compaq Tru64)
8309 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008310 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008311 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008312 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008313 * We can achieve the desired consistency by inserting our
8314 * own '0x' or '0X' prefix, and substituting %x/%X in place
8315 * of %#x/%#X.
8316 *
8317 * Note that this is the same approach as used in
8318 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008319 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008320 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8321 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008322 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008323 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008324 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8325 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008326 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008327 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008328 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008329 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008330 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008331 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008332}
8333
8334static int
8335formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008336 size_t buflen,
8337 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008338{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008339 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008340 if (PyUnicode_Check(v)) {
8341 if (PyUnicode_GET_SIZE(v) != 1)
8342 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008343 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008344 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008345
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008346 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008347 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008348 goto onError;
8349 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8350 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008351
8352 else {
8353 /* Integer input truncated to a character */
8354 long x;
8355 x = PyInt_AsLong(v);
8356 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008357 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008358#ifdef Py_UNICODE_WIDE
8359 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008360 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008361 "%c arg not in range(0x110000) "
8362 "(wide Python build)");
8363 return -1;
8364 }
8365#else
8366 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008367 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008368 "%c arg not in range(0x10000) "
8369 "(narrow Python build)");
8370 return -1;
8371 }
8372#endif
8373 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008374 }
8375 buf[1] = '\0';
8376 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008377
8378 onError:
8379 PyErr_SetString(PyExc_TypeError,
8380 "%c requires int or char");
8381 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008382}
8383
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008384/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8385
8386 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8387 chars are formatted. XXX This is a magic number. Each formatting
8388 routine does bounds checking to ensure no overflow, but a better
8389 solution may be to malloc a buffer of appropriate size for each
8390 format. For now, the current solution is sufficient.
8391*/
8392#define FORMATBUFLEN (size_t)120
8393
Guido van Rossumd57fd912000-03-10 22:53:23 +00008394PyObject *PyUnicode_Format(PyObject *format,
8395 PyObject *args)
8396{
8397 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008398 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008399 int args_owned = 0;
8400 PyUnicodeObject *result = NULL;
8401 PyObject *dict = NULL;
8402 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008403
Guido van Rossumd57fd912000-03-10 22:53:23 +00008404 if (format == NULL || args == NULL) {
8405 PyErr_BadInternalCall();
8406 return NULL;
8407 }
8408 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008409 if (uformat == NULL)
8410 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008411 fmt = PyUnicode_AS_UNICODE(uformat);
8412 fmtcnt = PyUnicode_GET_SIZE(uformat);
8413
8414 reslen = rescnt = fmtcnt + 100;
8415 result = _PyUnicode_New(reslen);
8416 if (result == NULL)
8417 goto onError;
8418 res = PyUnicode_AS_UNICODE(result);
8419
8420 if (PyTuple_Check(args)) {
8421 arglen = PyTuple_Size(args);
8422 argidx = 0;
8423 }
8424 else {
8425 arglen = -1;
8426 argidx = -2;
8427 }
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008428 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008429 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008430 dict = args;
8431
8432 while (--fmtcnt >= 0) {
8433 if (*fmt != '%') {
8434 if (--rescnt < 0) {
8435 rescnt = fmtcnt + 100;
8436 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008437 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008438 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008439 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8440 --rescnt;
8441 }
8442 *res++ = *fmt++;
8443 }
8444 else {
8445 /* Got a format specifier */
8446 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008447 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008448 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008449 Py_UNICODE c = '\0';
8450 Py_UNICODE fill;
8451 PyObject *v = NULL;
8452 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008453 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008454 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008455 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008456 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008457
8458 fmt++;
8459 if (*fmt == '(') {
8460 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008461 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008462 PyObject *key;
8463 int pcount = 1;
8464
8465 if (dict == NULL) {
8466 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008467 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008468 goto onError;
8469 }
8470 ++fmt;
8471 --fmtcnt;
8472 keystart = fmt;
8473 /* Skip over balanced parentheses */
8474 while (pcount > 0 && --fmtcnt >= 0) {
8475 if (*fmt == ')')
8476 --pcount;
8477 else if (*fmt == '(')
8478 ++pcount;
8479 fmt++;
8480 }
8481 keylen = fmt - keystart - 1;
8482 if (fmtcnt < 0 || pcount > 0) {
8483 PyErr_SetString(PyExc_ValueError,
8484 "incomplete format key");
8485 goto onError;
8486 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008487#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008488 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008489 then looked up since Python uses strings to hold
8490 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008491 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008492 key = PyUnicode_EncodeUTF8(keystart,
8493 keylen,
8494 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008495#else
8496 key = PyUnicode_FromUnicode(keystart, keylen);
8497#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008498 if (key == NULL)
8499 goto onError;
8500 if (args_owned) {
8501 Py_DECREF(args);
8502 args_owned = 0;
8503 }
8504 args = PyObject_GetItem(dict, key);
8505 Py_DECREF(key);
8506 if (args == NULL) {
8507 goto onError;
8508 }
8509 args_owned = 1;
8510 arglen = -1;
8511 argidx = -2;
8512 }
8513 while (--fmtcnt >= 0) {
8514 switch (c = *fmt++) {
8515 case '-': flags |= F_LJUST; continue;
8516 case '+': flags |= F_SIGN; continue;
8517 case ' ': flags |= F_BLANK; continue;
8518 case '#': flags |= F_ALT; continue;
8519 case '0': flags |= F_ZERO; continue;
8520 }
8521 break;
8522 }
8523 if (c == '*') {
8524 v = getnextarg(args, arglen, &argidx);
8525 if (v == NULL)
8526 goto onError;
8527 if (!PyInt_Check(v)) {
8528 PyErr_SetString(PyExc_TypeError,
8529 "* wants int");
8530 goto onError;
8531 }
8532 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008533 if (width == -1 && PyErr_Occurred())
8534 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008535 if (width < 0) {
8536 flags |= F_LJUST;
8537 width = -width;
8538 }
8539 if (--fmtcnt >= 0)
8540 c = *fmt++;
8541 }
8542 else if (c >= '0' && c <= '9') {
8543 width = c - '0';
8544 while (--fmtcnt >= 0) {
8545 c = *fmt++;
8546 if (c < '0' || c > '9')
8547 break;
8548 if ((width*10) / 10 != width) {
8549 PyErr_SetString(PyExc_ValueError,
8550 "width too big");
8551 goto onError;
8552 }
8553 width = width*10 + (c - '0');
8554 }
8555 }
8556 if (c == '.') {
8557 prec = 0;
8558 if (--fmtcnt >= 0)
8559 c = *fmt++;
8560 if (c == '*') {
8561 v = getnextarg(args, arglen, &argidx);
8562 if (v == NULL)
8563 goto onError;
8564 if (!PyInt_Check(v)) {
8565 PyErr_SetString(PyExc_TypeError,
8566 "* wants int");
8567 goto onError;
8568 }
8569 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008570 if (prec == -1 && PyErr_Occurred())
8571 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008572 if (prec < 0)
8573 prec = 0;
8574 if (--fmtcnt >= 0)
8575 c = *fmt++;
8576 }
8577 else if (c >= '0' && c <= '9') {
8578 prec = c - '0';
8579 while (--fmtcnt >= 0) {
8580 c = Py_CHARMASK(*fmt++);
8581 if (c < '0' || c > '9')
8582 break;
8583 if ((prec*10) / 10 != prec) {
8584 PyErr_SetString(PyExc_ValueError,
8585 "prec too big");
8586 goto onError;
8587 }
8588 prec = prec*10 + (c - '0');
8589 }
8590 }
8591 } /* prec */
8592 if (fmtcnt >= 0) {
8593 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008594 if (--fmtcnt >= 0)
8595 c = *fmt++;
8596 }
8597 }
8598 if (fmtcnt < 0) {
8599 PyErr_SetString(PyExc_ValueError,
8600 "incomplete format");
8601 goto onError;
8602 }
8603 if (c != '%') {
8604 v = getnextarg(args, arglen, &argidx);
8605 if (v == NULL)
8606 goto onError;
8607 }
8608 sign = 0;
8609 fill = ' ';
8610 switch (c) {
8611
8612 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008613 pbuf = formatbuf;
8614 /* presume that buffer length is at least 1 */
8615 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616 len = 1;
8617 break;
8618
8619 case 's':
8620 case 'r':
8621 if (PyUnicode_Check(v) && c == 's') {
8622 temp = v;
8623 Py_INCREF(temp);
8624 }
8625 else {
8626 PyObject *unicode;
8627 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008628 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629 else
8630 temp = PyObject_Repr(v);
8631 if (temp == NULL)
8632 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008633 if (PyUnicode_Check(temp))
8634 /* nothing to do */;
8635 else if (PyString_Check(temp)) {
8636 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008637 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008638 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008639 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008640 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008641 Py_DECREF(temp);
8642 temp = unicode;
8643 if (temp == NULL)
8644 goto onError;
8645 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008646 else {
8647 Py_DECREF(temp);
8648 PyErr_SetString(PyExc_TypeError,
8649 "%s argument has non-string str()");
8650 goto onError;
8651 }
8652 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008653 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008654 len = PyUnicode_GET_SIZE(temp);
8655 if (prec >= 0 && len > prec)
8656 len = prec;
8657 break;
8658
8659 case 'i':
8660 case 'd':
8661 case 'u':
8662 case 'o':
8663 case 'x':
8664 case 'X':
8665 if (c == 'i')
8666 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008667 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008668 temp = formatlong(v, flags, prec, c);
8669 if (!temp)
8670 goto onError;
8671 pbuf = PyUnicode_AS_UNICODE(temp);
8672 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008673 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008674 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008675 else {
8676 pbuf = formatbuf;
8677 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8678 flags, prec, c, v);
8679 if (len < 0)
8680 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008681 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008682 }
8683 if (flags & F_ZERO)
8684 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685 break;
8686
8687 case 'e':
8688 case 'E':
8689 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008690 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008691 case 'g':
8692 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008693 if (c == 'F')
8694 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008695 pbuf = formatbuf;
8696 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8697 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008698 if (len < 0)
8699 goto onError;
8700 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008701 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008702 fill = '0';
8703 break;
8704
8705 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008706 pbuf = formatbuf;
8707 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008708 if (len < 0)
8709 goto onError;
8710 break;
8711
8712 default:
8713 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008714 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008715 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008716 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008717 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008718 (Py_ssize_t)(fmt - 1 -
8719 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008720 goto onError;
8721 }
8722 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008723 if (*pbuf == '-' || *pbuf == '+') {
8724 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008725 len--;
8726 }
8727 else if (flags & F_SIGN)
8728 sign = '+';
8729 else if (flags & F_BLANK)
8730 sign = ' ';
8731 else
8732 sign = 0;
8733 }
8734 if (width < len)
8735 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008736 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008737 reslen -= rescnt;
8738 rescnt = width + fmtcnt + 100;
8739 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008740 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008741 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008742 PyErr_NoMemory();
8743 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008744 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008745 if (_PyUnicode_Resize(&result, reslen) < 0) {
8746 Py_XDECREF(temp);
8747 goto onError;
8748 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008749 res = PyUnicode_AS_UNICODE(result)
8750 + reslen - rescnt;
8751 }
8752 if (sign) {
8753 if (fill != ' ')
8754 *res++ = sign;
8755 rescnt--;
8756 if (width > len)
8757 width--;
8758 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008759 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008760 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008761 assert(pbuf[1] == c);
8762 if (fill != ' ') {
8763 *res++ = *pbuf++;
8764 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008765 }
Tim Petersfff53252001-04-12 18:38:48 +00008766 rescnt -= 2;
8767 width -= 2;
8768 if (width < 0)
8769 width = 0;
8770 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008771 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008772 if (width > len && !(flags & F_LJUST)) {
8773 do {
8774 --rescnt;
8775 *res++ = fill;
8776 } while (--width > len);
8777 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008778 if (fill == ' ') {
8779 if (sign)
8780 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008781 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008782 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008783 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008784 *res++ = *pbuf++;
8785 *res++ = *pbuf++;
8786 }
8787 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008788 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008789 res += len;
8790 rescnt -= len;
8791 while (--width >= len) {
8792 --rescnt;
8793 *res++ = ' ';
8794 }
8795 if (dict && (argidx < arglen) && c != '%') {
8796 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008797 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008798 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008799 goto onError;
8800 }
8801 Py_XDECREF(temp);
8802 } /* '%' */
8803 } /* until end */
8804 if (argidx < arglen && !dict) {
8805 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008806 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008807 goto onError;
8808 }
8809
Thomas Woutersa96affe2006-03-12 00:29:36 +00008810 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8811 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008812 if (args_owned) {
8813 Py_DECREF(args);
8814 }
8815 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008816 return (PyObject *)result;
8817
8818 onError:
8819 Py_XDECREF(result);
8820 Py_DECREF(uformat);
8821 if (args_owned) {
8822 Py_DECREF(args);
8823 }
8824 return NULL;
8825}
8826
8827static PyBufferProcs unicode_as_buffer = {
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008828 (getbufferproc) unicode_buffer_getbuffer,
8829 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008830};
8831
Jeremy Hylton938ace62002-07-17 16:30:39 +00008832static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008833unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8834
Tim Peters6d6c1a32001-08-02 04:15:00 +00008835static PyObject *
8836unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8837{
8838 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00008839 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008840 char *encoding = NULL;
8841 char *errors = NULL;
8842
Guido van Rossume023fe02001-08-30 03:12:59 +00008843 if (type != &PyUnicode_Type)
8844 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008845 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8846 kwlist, &x, &encoding, &errors))
8847 return NULL;
8848 if (x == NULL)
8849 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008850 if (encoding == NULL && errors == NULL)
8851 return PyObject_Unicode(x);
8852 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008853 return PyUnicode_FromEncodedObject(x, encoding, errors);
8854}
8855
Guido van Rossume023fe02001-08-30 03:12:59 +00008856static PyObject *
8857unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8858{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008859 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008860 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008861
8862 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8863 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8864 if (tmp == NULL)
8865 return NULL;
8866 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008867 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008868 if (pnew == NULL) {
8869 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008870 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008871 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008872 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8873 if (pnew->str == NULL) {
8874 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008875 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008876 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008877 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008878 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008879 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8880 pnew->length = n;
8881 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008882 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008883 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008884}
8885
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008886PyDoc_STRVAR(unicode_doc,
Collin Winterd474ce82007-08-07 19:42:11 +00008887"str(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008888\n\
Collin Winterd474ce82007-08-07 19:42:11 +00008889Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008890encoding defaults to the current default string encoding.\n\
8891errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008892
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008893static PyObject *unicode_iter(PyObject *seq);
8894
Guido van Rossumd57fd912000-03-10 22:53:23 +00008895PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008896 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008897 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008898 sizeof(PyUnicodeObject), /* tp_size */
8899 0, /* tp_itemsize */
8900 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008901 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008902 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008903 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008904 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008905 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008906 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008907 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008908 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008909 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008910 (hashfunc) unicode_hash, /* tp_hash*/
8911 0, /* tp_call*/
8912 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008913 PyObject_GenericGetAttr, /* tp_getattro */
8914 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008915 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008916 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8917 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008918 unicode_doc, /* tp_doc */
8919 0, /* tp_traverse */
8920 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008921 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008922 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008923 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008924 0, /* tp_iternext */
8925 unicode_methods, /* tp_methods */
8926 0, /* tp_members */
8927 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008928 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008929 0, /* tp_dict */
8930 0, /* tp_descr_get */
8931 0, /* tp_descr_set */
8932 0, /* tp_dictoffset */
8933 0, /* tp_init */
8934 0, /* tp_alloc */
8935 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008936 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008937};
8938
8939/* Initialize the Unicode implementation */
8940
Thomas Wouters78890102000-07-22 19:25:51 +00008941void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008942{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008943 int i;
8944
Thomas Wouters477c8d52006-05-27 19:21:47 +00008945 /* XXX - move this array to unicodectype.c ? */
8946 Py_UNICODE linebreak[] = {
8947 0x000A, /* LINE FEED */
8948 0x000D, /* CARRIAGE RETURN */
8949 0x001C, /* FILE SEPARATOR */
8950 0x001D, /* GROUP SEPARATOR */
8951 0x001E, /* RECORD SEPARATOR */
8952 0x0085, /* NEXT LINE */
8953 0x2028, /* LINE SEPARATOR */
8954 0x2029, /* PARAGRAPH SEPARATOR */
8955 };
8956
Fred Drakee4315f52000-05-09 19:53:39 +00008957 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008958 unicode_freelist = NULL;
8959 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008960 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008961 if (!unicode_empty)
8962 return;
8963
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008964 for (i = 0; i < 256; i++)
8965 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008966 if (PyType_Ready(&PyUnicode_Type) < 0)
8967 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008968
8969 /* initialize the linebreak bloom filter */
8970 bloom_linebreak = make_bloom_mask(
8971 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8972 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008973
8974 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008975}
8976
8977/* Finalize the Unicode implementation */
8978
8979void
Thomas Wouters78890102000-07-22 19:25:51 +00008980_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008981{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008982 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008983 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008985 Py_XDECREF(unicode_empty);
8986 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008987
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008988 for (i = 0; i < 256; i++) {
8989 if (unicode_latin1[i]) {
8990 Py_DECREF(unicode_latin1[i]);
8991 unicode_latin1[i] = NULL;
8992 }
8993 }
8994
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008995 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008996 PyUnicodeObject *v = u;
8997 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008998 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008999 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00009000 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009001 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009002 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009003 unicode_freelist = NULL;
9004 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009005}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009006
Walter Dörwald16807132007-05-25 13:52:07 +00009007void
9008PyUnicode_InternInPlace(PyObject **p)
9009{
9010 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9011 PyObject *t;
9012 if (s == NULL || !PyUnicode_Check(s))
9013 Py_FatalError(
9014 "PyUnicode_InternInPlace: unicode strings only please!");
9015 /* If it's a subclass, we don't really know what putting
9016 it in the interned dict might do. */
9017 if (!PyUnicode_CheckExact(s))
9018 return;
9019 if (PyUnicode_CHECK_INTERNED(s))
9020 return;
9021 if (interned == NULL) {
9022 interned = PyDict_New();
9023 if (interned == NULL) {
9024 PyErr_Clear(); /* Don't leave an exception */
9025 return;
9026 }
9027 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009028 /* It might be that the GetItem call fails even
9029 though the key is present in the dictionary,
9030 namely when this happens during a stack overflow. */
9031 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009032 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009033 Py_END_ALLOW_RECURSION
9034
Walter Dörwald16807132007-05-25 13:52:07 +00009035 if (t) {
9036 Py_INCREF(t);
9037 Py_DECREF(*p);
9038 *p = t;
9039 return;
9040 }
9041
Martin v. Löwis5b222132007-06-10 09:51:05 +00009042 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009043 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9044 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009045 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009046 return;
9047 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009048 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009049 /* The two references in interned are not counted by refcnt.
9050 The deallocator will take care of this */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009051 Py_Refcnt(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009052 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9053}
9054
9055void
9056PyUnicode_InternImmortal(PyObject **p)
9057{
9058 PyUnicode_InternInPlace(p);
9059 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9060 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9061 Py_INCREF(*p);
9062 }
9063}
9064
9065PyObject *
9066PyUnicode_InternFromString(const char *cp)
9067{
9068 PyObject *s = PyUnicode_FromString(cp);
9069 if (s == NULL)
9070 return NULL;
9071 PyUnicode_InternInPlace(&s);
9072 return s;
9073}
9074
9075void _Py_ReleaseInternedUnicodeStrings(void)
9076{
9077 PyObject *keys;
9078 PyUnicodeObject *s;
9079 Py_ssize_t i, n;
9080 Py_ssize_t immortal_size = 0, mortal_size = 0;
9081
9082 if (interned == NULL || !PyDict_Check(interned))
9083 return;
9084 keys = PyDict_Keys(interned);
9085 if (keys == NULL || !PyList_Check(keys)) {
9086 PyErr_Clear();
9087 return;
9088 }
9089
9090 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9091 detector, interned unicode strings are not forcibly deallocated;
9092 rather, we give them their stolen references back, and then clear
9093 and DECREF the interned dict. */
9094
9095 n = PyList_GET_SIZE(keys);
9096 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9097 n);
9098 for (i = 0; i < n; i++) {
9099 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9100 switch (s->state) {
9101 case SSTATE_NOT_INTERNED:
9102 /* XXX Shouldn't happen */
9103 break;
9104 case SSTATE_INTERNED_IMMORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009105 Py_Refcnt(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009106 immortal_size += s->length;
9107 break;
9108 case SSTATE_INTERNED_MORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009109 Py_Refcnt(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009110 mortal_size += s->length;
9111 break;
9112 default:
9113 Py_FatalError("Inconsistent interned string state.");
9114 }
9115 s->state = SSTATE_NOT_INTERNED;
9116 }
9117 fprintf(stderr, "total size of all interned strings: "
9118 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9119 "mortal/immortal\n", mortal_size, immortal_size);
9120 Py_DECREF(keys);
9121 PyDict_Clear(interned);
9122 Py_DECREF(interned);
9123 interned = NULL;
9124}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009125
9126
9127/********************* Unicode Iterator **************************/
9128
9129typedef struct {
9130 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00009131 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009132 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9133} unicodeiterobject;
9134
9135static void
9136unicodeiter_dealloc(unicodeiterobject *it)
9137{
9138 _PyObject_GC_UNTRACK(it);
9139 Py_XDECREF(it->it_seq);
9140 PyObject_GC_Del(it);
9141}
9142
9143static int
9144unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9145{
9146 Py_VISIT(it->it_seq);
9147 return 0;
9148}
9149
9150static PyObject *
9151unicodeiter_next(unicodeiterobject *it)
9152{
9153 PyUnicodeObject *seq;
9154 PyObject *item;
9155
9156 assert(it != NULL);
9157 seq = it->it_seq;
9158 if (seq == NULL)
9159 return NULL;
9160 assert(PyUnicode_Check(seq));
9161
9162 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009163 item = PyUnicode_FromUnicode(
9164 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009165 if (item != NULL)
9166 ++it->it_index;
9167 return item;
9168 }
9169
9170 Py_DECREF(seq);
9171 it->it_seq = NULL;
9172 return NULL;
9173}
9174
9175static PyObject *
9176unicodeiter_len(unicodeiterobject *it)
9177{
9178 Py_ssize_t len = 0;
9179 if (it->it_seq)
9180 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9181 return PyInt_FromSsize_t(len);
9182}
9183
9184PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9185
9186static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009187 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9188 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009189 {NULL, NULL} /* sentinel */
9190};
9191
9192PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009193 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009194 "unicodeiterator", /* tp_name */
9195 sizeof(unicodeiterobject), /* tp_basicsize */
9196 0, /* tp_itemsize */
9197 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00009198 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009199 0, /* tp_print */
9200 0, /* tp_getattr */
9201 0, /* tp_setattr */
9202 0, /* tp_compare */
9203 0, /* tp_repr */
9204 0, /* tp_as_number */
9205 0, /* tp_as_sequence */
9206 0, /* tp_as_mapping */
9207 0, /* tp_hash */
9208 0, /* tp_call */
9209 0, /* tp_str */
9210 PyObject_GenericGetAttr, /* tp_getattro */
9211 0, /* tp_setattro */
9212 0, /* tp_as_buffer */
9213 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9214 0, /* tp_doc */
9215 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9216 0, /* tp_clear */
9217 0, /* tp_richcompare */
9218 0, /* tp_weaklistoffset */
9219 PyObject_SelfIter, /* tp_iter */
9220 (iternextfunc)unicodeiter_next, /* tp_iternext */
9221 unicodeiter_methods, /* tp_methods */
9222 0,
9223};
9224
9225static PyObject *
9226unicode_iter(PyObject *seq)
9227{
9228 unicodeiterobject *it;
9229
9230 if (!PyUnicode_Check(seq)) {
9231 PyErr_BadInternalCall();
9232 return NULL;
9233 }
9234 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9235 if (it == NULL)
9236 return NULL;
9237 it->it_index = 0;
9238 Py_INCREF(seq);
9239 it->it_seq = (PyUnicodeObject *)seq;
9240 _PyObject_GC_TRACK(it);
9241 return (PyObject *)it;
9242}
9243
Martin v. Löwis5b222132007-06-10 09:51:05 +00009244size_t
9245Py_UNICODE_strlen(const Py_UNICODE *u)
9246{
9247 int res = 0;
9248 while(*u++)
9249 res++;
9250 return res;
9251}
9252
9253Py_UNICODE*
9254Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9255{
9256 Py_UNICODE *u = s1;
9257 while ((*u++ = *s2++));
9258 return s1;
9259}
9260
9261Py_UNICODE*
9262Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9263{
9264 Py_UNICODE *u = s1;
9265 while ((*u++ = *s2++))
9266 if (n-- == 0)
9267 break;
9268 return s1;
9269}
9270
9271int
9272Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9273{
9274 while (*s1 && *s2 && *s1 == *s2)
9275 s1++, s2++;
9276 if (*s1 && *s2)
9277 return (*s1 < *s2) ? -1 : +1;
9278 if (*s1)
9279 return 1;
9280 if (*s2)
9281 return -1;
9282 return 0;
9283}
9284
9285Py_UNICODE*
9286Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9287{
9288 const Py_UNICODE *p;
9289 for (p = s; *p; p++)
9290 if (*p == c)
9291 return (Py_UNICODE*)p;
9292 return NULL;
9293}
9294
9295
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009296#ifdef __cplusplus
9297}
9298#endif
9299
9300
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009301/*
9302Local variables:
9303c-basic-offset: 4
9304indent-tabs-mode: nil
9305End:
9306*/