blob: 98723dba81606b037d1b589eea7066b28f8ea3e7 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Eric Smith8c663262007-08-25 02:26:07 +000048#include "formatter_unicode.h"
49
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000050#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000051#include <windows.h>
52#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000053
/* Upper bound on the number of Unicode objects kept on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Keep-alive threshold for Unicode objects parked on the free list.

   An object whose length is below this limit keeps its character buffer
   while it sits on the free list, so reusing it for another small string
   does not need a fresh malloc().

   In the worst case this leaves
       MAX_UNICODE_FREELIST_SIZE *
       (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + malloc()-overhead)
   bytes of unused garbage allocated.

   A limit of 0 effectively turns the feature off.

   Note: this is an experimental feature.  If you get core dumps when
   using Unicode objects, turn it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order switches; little endian is assumed unless WORDS_BIGENDIAN
   is defined. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
84
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000085/* --- Globals ------------------------------------------------------------
86
87 The globals are initialized by the _PyUnicode_Init() API and should
88 not be used before calling that API.
89
90*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000091
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000092
93#ifdef __cplusplus
94extern "C" {
95#endif
96
Walter Dörwald16807132007-05-25 13:52:07 +000097/* This dictionary holds all interned unicode strings. Note that references
98 to strings in this dictionary are *not* counted in the string's ob_refcnt.
99 When the interned string reaches a refcnt of 0 the string deallocation
100 function will delete the reference from this dictionary.
101
102 Another way to look at this is that to say that the actual reference
103 count of a string is: s->ob_refcnt + (s->ob_sstate?2:0)
104*/
105static PyObject *interned;
106
Guido van Rossumd57fd912000-03-10 22:53:23 +0000107/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000108static PyUnicodeObject *unicode_freelist;
109static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000110
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000111/* The empty Unicode object is shared to improve performance. */
112static PyUnicodeObject *unicode_empty;
113
114/* Single character Unicode strings in the Latin-1 range are being
115 shared as well. */
116static PyUnicodeObject *unicode_latin1[256];
117
Fred Drakee4315f52000-05-09 19:53:39 +0000118/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000119 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000120 PyUnicode_GetDefaultEncoding() API to access this global.
121
122 Don't forget to alter Py_FileSystemDefaultEncoding() if you change the
123 hard coded default!
124*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000125static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000126
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000127Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000128PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000129{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000130#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000131 return 0x10FFFF;
132#else
133 /* This is actually an illegal character, so it should
134 not be passed to unichr. */
135 return 0xFFFF;
136#endif
137}
138
Thomas Wouters477c8d52006-05-27 19:21:47 +0000139/* --- Bloom Filters ----------------------------------------------------- */
140
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, a single bitmask is used; the least significant
   5 bits of each Unicode character select the bit index. */

/* The linebreak mask is set up by _PyUnicode_Init() below. */
146
/* Integer type holding one bloom filter bitmask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK() as a fast pre-check. */
static BLOOM_MASK bloom_linebreak;

/* Nonzero if the bit selected by the low 5 bits of ch is set in mask.
   The constant being shifted is unsigned long: shifting a plain int 1 by
   31 overflows a 32-bit signed int, which is undefined behaviour.  Both
   arguments are parenthesized so arbitrary expressions can be passed. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* True if ch passes the bloom pre-check and Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch) \
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
155
156Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
157{
158 /* calculate simple bloom-style bitmask for a given unicode string */
159
160 long mask;
161 Py_ssize_t i;
162
163 mask = 0;
164 for (i = 0; i < len; i++)
165 mask |= (1 << (ptr[i] & 0x1F));
166
167 return mask;
168}
169
170Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
171{
172 Py_ssize_t i;
173
174 for (i = 0; i < setlen; i++)
175 if (set[i] == chr)
176 return 1;
177
178 return 0;
179}
180
/* True if chr passes the bloom pre-check against mask and is really
   contained in set (checked with unicode_member()).  The whole expansion
   is parenthesized so the macro can be negated or combined with other
   operators without precedence surprises. */
#define BLOOM_MEMBER(mask, chr, set, setlen) \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
183
Guido van Rossumd57fd912000-03-10 22:53:23 +0000184/* --- Unicode Object ----------------------------------------------------- */
185
186static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000188 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189{
190 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000191
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000192 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000193 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000194 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000195
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196 /* Resizing shared object (unicode_empty or single character
197 objects) in-place is not allowed. Use PyUnicode_Resize()
198 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000199
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000200 if (unicode == unicode_empty ||
201 (unicode->length == 1 &&
202 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000203 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000204 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000206 return -1;
207 }
208
Thomas Wouters477c8d52006-05-27 19:21:47 +0000209 /* We allocate one more byte to make sure the string is Ux0000 terminated.
210 The overallocation is also used by fastsearch, which assumes that it's
211 safe to look at str[length] (without making any assumptions about what
212 it contains). */
213
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 oldstr = unicode->str;
215 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
216 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000217 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 PyErr_NoMemory();
219 return -1;
220 }
221 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000222 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000223
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000224 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000226 if (unicode->defenc) {
227 Py_DECREF(unicode->defenc);
228 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229 }
230 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000231
Guido van Rossumd57fd912000-03-10 22:53:23 +0000232 return 0;
233}
234
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is always 0-terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could be further enhanced by ensuring that the
   free list never shrinks below a size of 1.

*/
243
244static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000245PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246{
247 register PyUnicodeObject *unicode;
248
Thomas Wouters477c8d52006-05-27 19:21:47 +0000249 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 if (length == 0 && unicode_empty != NULL) {
251 Py_INCREF(unicode_empty);
252 return unicode_empty;
253 }
254
255 /* Unicode freelist & memory allocation */
256 if (unicode_freelist) {
257 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000258 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000261 /* Keep-Alive optimization: we only upsize the buffer,
262 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000263 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000264 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000265 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000266 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 }
268 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000269 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000270 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000271 }
272 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000273 }
274 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000275 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 if (unicode == NULL)
277 return NULL;
278 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
279 }
280
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000281 if (!unicode->str) {
282 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000283 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000284 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000285 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000286 * the caller fails before initializing str -- unicode_resize()
287 * reads str[0], and the Keep-Alive optimization can keep memory
288 * allocated for str alive across a call to unicode_dealloc(unicode).
289 * We don't want unicode_resize to read uninitialized memory in
290 * that case.
291 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000292 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000294 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000295 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000296 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000297 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000299
300 onError:
301 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000302 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000303 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304}
305
306static
Guido van Rossum9475a232001-10-05 20:51:39 +0000307void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308{
Walter Dörwald16807132007-05-25 13:52:07 +0000309 switch (PyUnicode_CHECK_INTERNED(unicode)) {
310 case SSTATE_NOT_INTERNED:
311 break;
312
313 case SSTATE_INTERNED_MORTAL:
314 /* revive dead object temporarily for DelItem */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +0000315 Py_Refcnt(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000316 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
317 Py_FatalError(
318 "deletion of interned unicode string failed");
319 break;
320
321 case SSTATE_INTERNED_IMMORTAL:
322 Py_FatalError("Immortal interned unicode string died.");
323
324 default:
325 Py_FatalError("Inconsistent interned unicode string state.");
326 }
327
Guido van Rossum604ddf82001-12-06 20:03:56 +0000328 if (PyUnicode_CheckExact(unicode) &&
329 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000330 /* Keep-Alive optimization */
331 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000332 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 unicode->str = NULL;
334 unicode->length = 0;
335 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000336 if (unicode->defenc) {
337 Py_DECREF(unicode->defenc);
338 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000339 }
340 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000341 *(PyUnicodeObject **)unicode = unicode_freelist;
342 unicode_freelist = unicode;
343 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000344 }
345 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000346 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000347 Py_XDECREF(unicode->defenc);
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000348 Py_Type(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000349 }
350}
351
Martin v. Löwis18e16552006-02-15 17:27:45 +0000352int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000353{
354 register PyUnicodeObject *v;
355
356 /* Argument checks */
357 if (unicode == NULL) {
358 PyErr_BadInternalCall();
359 return -1;
360 }
361 v = (PyUnicodeObject *)*unicode;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000362 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000363 PyErr_BadInternalCall();
364 return -1;
365 }
366
367 /* Resizing unicode_empty and single character objects is not
368 possible since these are being shared. We simply return a fresh
369 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000370 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000371 (v == unicode_empty || v->length == 1)) {
372 PyUnicodeObject *w = _PyUnicode_New(length);
373 if (w == NULL)
374 return -1;
375 Py_UNICODE_COPY(w->str, v->str,
376 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000377 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000378 *unicode = (PyObject *)w;
379 return 0;
380 }
381
382 /* Note that we don't have to modify *unicode for unshared Unicode
383 objects, since we can modify them in-place. */
384 return unicode_resize(v, length);
385}
386
/* Internal convenience wrapper for use in unicodeobject.c only: casts the
   address of a Unicode object variable to PyObject ** and forwards to
   PyUnicode_Resize(). */
#define _PyUnicode_Resize(objaddr, newlength) \
    PyUnicode_Resize(((PyObject **)(objaddr)), newlength)
390
Guido van Rossumd57fd912000-03-10 22:53:23 +0000391PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000392 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000393{
394 PyUnicodeObject *unicode;
395
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000396 /* If the Unicode data is known at construction time, we can apply
397 some optimizations which share commonly used objects. */
398 if (u != NULL) {
399
400 /* Optimization for empty strings */
401 if (size == 0 && unicode_empty != NULL) {
402 Py_INCREF(unicode_empty);
403 return (PyObject *)unicode_empty;
404 }
405
406 /* Single character Unicode objects in the Latin-1 range are
407 shared when using this constructor */
408 if (size == 1 && *u < 256) {
409 unicode = unicode_latin1[*u];
410 if (!unicode) {
411 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000412 if (!unicode)
413 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000414 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000415 unicode_latin1[*u] = unicode;
416 }
417 Py_INCREF(unicode);
418 return (PyObject *)unicode;
419 }
420 }
Tim Petersced69f82003-09-16 20:30:58 +0000421
Guido van Rossumd57fd912000-03-10 22:53:23 +0000422 unicode = _PyUnicode_New(size);
423 if (!unicode)
424 return NULL;
425
426 /* Copy the Unicode data into the new object */
427 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000428 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000429
430 return (PyObject *)unicode;
431}
432
Walter Dörwaldd2034312007-05-18 16:29:38 +0000433PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000434{
435 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000436 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000437 some optimizations which share commonly used objects.
438 Also, this means the input must be UTF-8, so fall back to the
439 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000440 if (u != NULL) {
441
442 /* Optimization for empty strings */
443 if (size == 0 && unicode_empty != NULL) {
444 Py_INCREF(unicode_empty);
445 return (PyObject *)unicode_empty;
446 }
447
Martin v. Löwis9c121062007-08-05 20:26:11 +0000448 /* Single characters are shared when using this constructor.
449 Restrict to ASCII, since the input must be UTF-8. */
450 if (size == 1 && Py_CHARMASK(*u) < 128) {
Guido van Rossum00058aa2007-07-19 18:21:28 +0000451 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000452 if (!unicode) {
453 unicode = _PyUnicode_New(1);
454 if (!unicode)
455 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000456 unicode->str[0] = Py_CHARMASK(*u);
457 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000458 }
459 Py_INCREF(unicode);
460 return (PyObject *)unicode;
461 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000462
463 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000464 }
465
Walter Dörwald55507312007-05-18 13:12:10 +0000466 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000467 if (!unicode)
468 return NULL;
469
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000470 return (PyObject *)unicode;
471}
472
Walter Dörwaldd2034312007-05-18 16:29:38 +0000473PyObject *PyUnicode_FromString(const char *u)
474{
475 size_t size = strlen(u);
476 if (size > PY_SSIZE_T_MAX) {
477 PyErr_SetString(PyExc_OverflowError, "input too long");
478 return NULL;
479 }
480
481 return PyUnicode_FromStringAndSize(u, size);
482}
483
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484#ifdef HAVE_WCHAR_H
485
486PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000487 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000488{
489 PyUnicodeObject *unicode;
490
491 if (w == NULL) {
492 PyErr_BadInternalCall();
493 return NULL;
494 }
495
496 unicode = _PyUnicode_New(size);
497 if (!unicode)
498 return NULL;
499
500 /* Copy the wchar_t data into the new object */
501#ifdef HAVE_USABLE_WCHAR_T
502 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000503#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000504 {
505 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000506 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000507 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000508 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000509 *u++ = *w++;
510 }
511#endif
512
513 return (PyObject *)unicode;
514}
515
Walter Dörwald346737f2007-05-31 10:44:43 +0000516static void
517makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
518{
519 *fmt++ = '%';
520 if (width) {
521 if (zeropad)
522 *fmt++ = '0';
523 fmt += sprintf(fmt, "%d", width);
524 }
525 if (precision)
526 fmt += sprintf(fmt, ".%d", precision);
527 if (longflag)
528 *fmt++ = 'l';
529 else if (size_tflag) {
530 char *f = PY_FORMAT_SIZE_T;
531 while (*f)
532 *fmt++ = *f++;
533 }
534 *fmt++ = c;
535 *fmt = '\0';
536}
537
/* Append the NUL-terminated char string `string` to the output being
   built, one character per output element.  The expansion site must
   declare `s` (the output write pointer, advanced past the copied
   characters) and `copy` (a const char * scratch pointer).

   Wrapped in do { } while (0) so that "appendstring(x);" is a single
   statement and is safe as the body of an if/else. */
#define appendstring(string) \
    do { for (copy = (string); *copy;) *s++ = *copy++; } while (0)
539
540PyObject *
541PyUnicode_FromFormatV(const char *format, va_list vargs)
542{
543 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000544 Py_ssize_t callcount = 0;
545 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000546 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000547 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000548 int width = 0;
549 int precision = 0;
550 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000551 const char* f;
552 Py_UNICODE *s;
553 PyObject *string;
554 /* used by sprintf */
555 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000556 /* use abuffer instead of buffer, if we need more space
557 * (which can happen if there's a format specifier with width). */
558 char *abuffer = NULL;
559 char *realbuffer;
560 Py_ssize_t abuffersize = 0;
561 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000562 const char *copy;
563
564#ifdef VA_LIST_IS_ARRAY
565 Py_MEMCPY(count, vargs, sizeof(va_list));
566#else
567#ifdef __va_copy
568 __va_copy(count, vargs);
569#else
570 count = vargs;
571#endif
572#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000573 /* step 1: count the number of %S/%R format specifications
574 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
575 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000576 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000577 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000578 ++callcount;
579 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000580 /* step 2: allocate memory for the results of
581 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000582 if (callcount) {
583 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
584 if (!callresults) {
585 PyErr_NoMemory();
586 return NULL;
587 }
588 callresult = callresults;
589 }
590 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000591 for (f = format; *f; f++) {
592 if (*f == '%') {
593 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000594 width = 0;
595 while (isdigit(Py_CHARMASK(*f)))
596 width = (width*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000597 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
598 ;
599
600 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
601 * they don't affect the amount of space we reserve.
602 */
603 if ((*f == 'l' || *f == 'z') &&
604 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000605 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000606
607 switch (*f) {
608 case 'c':
609 (void)va_arg(count, int);
610 /* fall through... */
611 case '%':
612 n++;
613 break;
614 case 'd': case 'u': case 'i': case 'x':
615 (void) va_arg(count, int);
616 /* 20 bytes is enough to hold a 64-bit
617 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000618 This isn't enough for octal.
619 If a width is specified we need more
620 (which we allocate later). */
621 if (width < 20)
622 width = 20;
623 n += width;
624 if (abuffersize < width)
625 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000626 break;
627 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000628 {
629 /* UTF-8 */
630 unsigned char*s;
631 s = va_arg(count, unsigned char*);
632 while (*s) {
633 if (*s < 128) {
634 n++; s++;
635 } else if (*s < 0xc0) {
636 /* invalid UTF-8 */
637 n++; s++;
638 } else if (*s < 0xc0) {
639 n++;
640 s++; if(!*s)break;
641 s++;
642 } else if (*s < 0xe0) {
643 n++;
644 s++; if(!*s)break;
645 s++; if(!*s)break;
646 s++;
647 } else {
648 #ifdef Py_UNICODE_WIDE
649 n++;
650 #else
651 n+=2;
652 #endif
653 s++; if(!*s)break;
654 s++; if(!*s)break;
655 s++; if(!*s)break;
656 s++;
657 }
658 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000659 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000660 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000661 case 'U':
662 {
663 PyObject *obj = va_arg(count, PyObject *);
664 assert(obj && PyUnicode_Check(obj));
665 n += PyUnicode_GET_SIZE(obj);
666 break;
667 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000668 case 'V':
669 {
670 PyObject *obj = va_arg(count, PyObject *);
671 const char *str = va_arg(count, const char *);
672 assert(obj || str);
673 assert(!obj || PyUnicode_Check(obj));
674 if (obj)
675 n += PyUnicode_GET_SIZE(obj);
676 else
677 n += strlen(str);
678 break;
679 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000680 case 'S':
681 {
682 PyObject *obj = va_arg(count, PyObject *);
683 PyObject *str;
684 assert(obj);
685 str = PyObject_Unicode(obj);
686 if (!str)
687 goto fail;
688 n += PyUnicode_GET_SIZE(str);
689 /* Remember the str and switch to the next slot */
690 *callresult++ = str;
691 break;
692 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000693 case 'R':
694 {
695 PyObject *obj = va_arg(count, PyObject *);
696 PyObject *repr;
697 assert(obj);
698 repr = PyObject_Repr(obj);
699 if (!repr)
700 goto fail;
701 n += PyUnicode_GET_SIZE(repr);
702 /* Remember the repr and switch to the next slot */
703 *callresult++ = repr;
704 break;
705 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000706 case 'p':
707 (void) va_arg(count, int);
708 /* maximum 64-bit pointer representation:
709 * 0xffffffffffffffff
710 * so 19 characters is enough.
711 * XXX I count 18 -- what's the extra for?
712 */
713 n += 19;
714 break;
715 default:
716 /* if we stumble upon an unknown
717 formatting code, copy the rest of
718 the format string to the output
719 string. (we cannot just skip the
720 code, since there's no way to know
721 what's in the argument list) */
722 n += strlen(p);
723 goto expand;
724 }
725 } else
726 n++;
727 }
728 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000729 if (abuffersize > 20) {
730 abuffer = PyMem_Malloc(abuffersize);
731 if (!abuffer) {
732 PyErr_NoMemory();
733 goto fail;
734 }
735 realbuffer = abuffer;
736 }
737 else
738 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000739 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000740 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000741 we don't have to resize the string.
742 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000743 string = PyUnicode_FromUnicode(NULL, n);
744 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000745 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000746
747 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000748 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000749
750 for (f = format; *f; f++) {
751 if (*f == '%') {
752 const char* p = f++;
753 int longflag = 0;
754 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000755 zeropad = (*f == '0');
756 /* parse the width.precision part */
757 width = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000758 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000759 width = (width*10) + *f++ - '0';
760 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000761 if (*f == '.') {
762 f++;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000763 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000764 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000765 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000766 /* handle the long flag, but only for %ld and %lu.
767 others can be added when necessary. */
768 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
769 longflag = 1;
770 ++f;
771 }
772 /* handle the size_t flag. */
773 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
774 size_tflag = 1;
775 ++f;
776 }
777
778 switch (*f) {
779 case 'c':
780 *s++ = va_arg(vargs, int);
781 break;
782 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000783 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000784 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000785 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000786 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000787 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000788 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000789 sprintf(realbuffer, fmt, va_arg(vargs, int));
790 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000791 break;
792 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000793 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000794 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000795 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000796 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000797 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000798 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000799 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
800 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000801 break;
802 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000803 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
804 sprintf(realbuffer, fmt, va_arg(vargs, int));
805 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000806 break;
807 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000808 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
809 sprintf(realbuffer, fmt, va_arg(vargs, int));
810 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000811 break;
812 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000813 {
814 /* Parameter must be UTF-8 encoded.
815 In case of encoding errors, use
816 the replacement character. */
817 PyObject *u;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000818 p = va_arg(vargs, char*);
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000819 u = PyUnicode_DecodeUTF8(p, strlen(p),
820 "replace");
821 if (!u)
822 goto fail;
823 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
824 PyUnicode_GET_SIZE(u));
825 s += PyUnicode_GET_SIZE(u);
826 Py_DECREF(u);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000827 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000828 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000829 case 'U':
830 {
831 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000832 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
833 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
834 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000835 break;
836 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000837 case 'V':
838 {
839 PyObject *obj = va_arg(vargs, PyObject *);
840 const char *str = va_arg(vargs, const char *);
841 if (obj) {
842 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
843 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
844 s += size;
845 } else {
846 appendstring(str);
847 }
848 break;
849 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000850 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000851 case 'R':
852 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000853 Py_UNICODE *ucopy;
854 Py_ssize_t usize;
855 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000856 /* unused, since we already have the result */
857 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000858 ucopy = PyUnicode_AS_UNICODE(*callresult);
859 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000860 for (upos = 0; upos<usize;)
861 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000862 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000863 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000864 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000865 ++callresult;
866 break;
867 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000868 case 'p':
869 sprintf(buffer, "%p", va_arg(vargs, void*));
870 /* %p is ill-defined: ensure leading 0x. */
871 if (buffer[1] == 'X')
872 buffer[1] = 'x';
873 else if (buffer[1] != 'x') {
874 memmove(buffer+2, buffer, strlen(buffer)+1);
875 buffer[0] = '0';
876 buffer[1] = 'x';
877 }
878 appendstring(buffer);
879 break;
880 case '%':
881 *s++ = '%';
882 break;
883 default:
884 appendstring(p);
885 goto end;
886 }
887 } else
888 *s++ = *f;
889 }
890
891 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000892 if (callresults)
893 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000894 if (abuffer)
895 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000896 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
897 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000898 fail:
899 if (callresults) {
900 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000901 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000902 Py_DECREF(*callresult2);
903 ++callresult2;
904 }
905 PyMem_Free(callresults);
906 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000907 if (abuffer)
908 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000909 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000910}
911
912#undef appendstring
913
914PyObject *
915PyUnicode_FromFormat(const char *format, ...)
916{
917 PyObject* ret;
918 va_list vargs;
919
920#ifdef HAVE_STDARG_PROTOTYPES
921 va_start(vargs, format);
922#else
923 va_start(vargs);
924#endif
925 ret = PyUnicode_FromFormatV(format, vargs);
926 va_end(vargs);
927 return ret;
928}
929
Martin v. Löwis18e16552006-02-15 17:27:45 +0000930Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
931 wchar_t *w,
932 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000933{
934 if (unicode == NULL) {
935 PyErr_BadInternalCall();
936 return -1;
937 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000938
939 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000940 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000941 size = PyUnicode_GET_SIZE(unicode) + 1;
942
Guido van Rossumd57fd912000-03-10 22:53:23 +0000943#ifdef HAVE_USABLE_WCHAR_T
944 memcpy(w, unicode->str, size * sizeof(wchar_t));
945#else
946 {
947 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000948 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000949 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000950 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000951 *w++ = *u++;
952 }
953#endif
954
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000955 if (size > PyUnicode_GET_SIZE(unicode))
956 return PyUnicode_GET_SIZE(unicode);
957 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000958 return size;
959}
960
961#endif
962
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000963PyObject *PyUnicode_FromOrdinal(int ordinal)
964{
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000965 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000966
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000967 if (ordinal < 0 || ordinal > 0x10ffff) {
968 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000969 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000970 return NULL;
971 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000972
973#ifndef Py_UNICODE_WIDE
974 if (ordinal > 0xffff) {
975 ordinal -= 0x10000;
976 s[0] = 0xD800 | (ordinal >> 10);
977 s[1] = 0xDC00 | (ordinal & 0x3FF);
978 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000979 }
980#endif
981
Hye-Shik Chang40574832004-04-06 07:24:51 +0000982 s[0] = (Py_UNICODE)ordinal;
983 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000984}
985
Guido van Rossumd57fd912000-03-10 22:53:23 +0000986PyObject *PyUnicode_FromObject(register PyObject *obj)
987{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000988 /* XXX Perhaps we should make this API an alias of
989 PyObject_Unicode() instead ?! */
990 if (PyUnicode_CheckExact(obj)) {
991 Py_INCREF(obj);
992 return obj;
993 }
994 if (PyUnicode_Check(obj)) {
995 /* For a Unicode subtype that's not a Unicode object,
996 return a true Unicode object with the same data. */
997 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
998 PyUnicode_GET_SIZE(obj));
999 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001000 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1001}
1002
1003PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1004 const char *encoding,
1005 const char *errors)
1006{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001007 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001008 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001009 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001010
Guido van Rossumd57fd912000-03-10 22:53:23 +00001011 if (obj == NULL) {
1012 PyErr_BadInternalCall();
1013 return NULL;
1014 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001015
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001016 if (PyUnicode_Check(obj)) {
1017 PyErr_SetString(PyExc_TypeError,
1018 "decoding Unicode is not supported");
1019 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001020 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001021
1022 /* Coerce object */
1023 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001024 s = PyString_AS_STRING(obj);
1025 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001026 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001027 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1028 /* Overwrite the error message with something more useful in
1029 case of a TypeError. */
1030 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001031 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001032 "coercing to Unicode: need string or buffer, "
1033 "%.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001034 Py_Type(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001035 goto onError;
1036 }
Tim Petersced69f82003-09-16 20:30:58 +00001037
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001038 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001039 if (len == 0) {
1040 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001041 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042 }
Tim Petersced69f82003-09-16 20:30:58 +00001043 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001044 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001045
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001046 return v;
1047
1048 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001049 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001050}
1051
1052PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001053 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001054 const char *encoding,
1055 const char *errors)
1056{
1057 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001058 Py_buffer info;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001059
1060 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001061 encoding = PyUnicode_GetDefaultEncoding();
1062
1063 /* Shortcuts for common default encodings */
1064 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001065 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001066 else if (strcmp(encoding, "latin-1") == 0)
1067 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001068#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1069 else if (strcmp(encoding, "mbcs") == 0)
1070 return PyUnicode_DecodeMBCS(s, size, errors);
1071#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001072 else if (strcmp(encoding, "ascii") == 0)
1073 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001074
1075 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001076 buffer = NULL;
1077 if (PyBuffer_FillInfo(&info, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
1078 goto onError;
1079 buffer = PyMemoryView_FromMemory(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001080 if (buffer == NULL)
1081 goto onError;
1082 unicode = PyCodec_Decode(buffer, encoding, errors);
1083 if (unicode == NULL)
1084 goto onError;
1085 if (!PyUnicode_Check(unicode)) {
1086 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001087 "decoder did not return an unicode object (type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001088 Py_Type(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001089 Py_DECREF(unicode);
1090 goto onError;
1091 }
1092 Py_DECREF(buffer);
1093 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001094
Guido van Rossumd57fd912000-03-10 22:53:23 +00001095 onError:
1096 Py_XDECREF(buffer);
1097 return NULL;
1098}
1099
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001100PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1101 const char *encoding,
1102 const char *errors)
1103{
1104 PyObject *v;
1105
1106 if (!PyUnicode_Check(unicode)) {
1107 PyErr_BadArgument();
1108 goto onError;
1109 }
1110
1111 if (encoding == NULL)
1112 encoding = PyUnicode_GetDefaultEncoding();
1113
1114 /* Decode via the codec registry */
1115 v = PyCodec_Decode(unicode, encoding, errors);
1116 if (v == NULL)
1117 goto onError;
1118 return v;
1119
1120 onError:
1121 return NULL;
1122}
1123
Guido van Rossumd57fd912000-03-10 22:53:23 +00001124PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001125 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001126 const char *encoding,
1127 const char *errors)
1128{
1129 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001130
Guido van Rossumd57fd912000-03-10 22:53:23 +00001131 unicode = PyUnicode_FromUnicode(s, size);
1132 if (unicode == NULL)
1133 return NULL;
1134 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1135 Py_DECREF(unicode);
1136 return v;
1137}
1138
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001139PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1140 const char *encoding,
1141 const char *errors)
1142{
1143 PyObject *v;
1144
1145 if (!PyUnicode_Check(unicode)) {
1146 PyErr_BadArgument();
1147 goto onError;
1148 }
1149
1150 if (encoding == NULL)
1151 encoding = PyUnicode_GetDefaultEncoding();
1152
1153 /* Encode via the codec registry */
1154 v = PyCodec_Encode(unicode, encoding, errors);
1155 if (v == NULL)
1156 goto onError;
1157 return v;
1158
1159 onError:
1160 return NULL;
1161}
1162
Guido van Rossumd57fd912000-03-10 22:53:23 +00001163PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1164 const char *encoding,
1165 const char *errors)
1166{
1167 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001168
Guido van Rossumd57fd912000-03-10 22:53:23 +00001169 if (!PyUnicode_Check(unicode)) {
1170 PyErr_BadArgument();
1171 goto onError;
1172 }
Fred Drakee4315f52000-05-09 19:53:39 +00001173
Tim Petersced69f82003-09-16 20:30:58 +00001174 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001175 encoding = PyUnicode_GetDefaultEncoding();
1176
1177 /* Shortcuts for common default encodings */
1178 if (errors == NULL) {
1179 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001180 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001181 else if (strcmp(encoding, "latin-1") == 0)
1182 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001183#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1184 else if (strcmp(encoding, "mbcs") == 0)
1185 return PyUnicode_AsMBCSString(unicode);
1186#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001187 else if (strcmp(encoding, "ascii") == 0)
1188 return PyUnicode_AsASCIIString(unicode);
1189 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190
1191 /* Encode via the codec registry */
1192 v = PyCodec_Encode(unicode, encoding, errors);
1193 if (v == NULL)
1194 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001195 if (!PyBytes_Check(v)) {
1196 if (PyString_Check(v)) {
1197 /* Old codec, turn it into bytes */
1198 PyObject *b = PyBytes_FromObject(v);
1199 Py_DECREF(v);
1200 return b;
1201 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001202 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001203 "encoder did not return a bytes object "
1204 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1205 v->ob_type->tp_name,
1206 encoding ? encoding : "NULL",
1207 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001208 Py_DECREF(v);
1209 goto onError;
1210 }
1211 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001212
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213 onError:
1214 return NULL;
1215}
1216
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001217PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1218 const char *errors)
1219{
1220 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001221 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001222 if (v)
1223 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001224 if (errors != NULL)
1225 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum06610092007-08-16 21:02:22 +00001226 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1227 PyUnicode_GET_SIZE(unicode),
1228 NULL);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001229 if (!b)
1230 return NULL;
1231 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1232 PyBytes_Size(b));
1233 Py_DECREF(b);
Guido van Rossume7a0d392007-07-12 07:53:00 +00001234 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001235 return v;
1236}
1237
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001238PyObject*
1239PyUnicode_DecodeFSDefault(const char *s)
1240{
1241 Py_ssize_t size = (Py_ssize_t)strlen(s);
1242
1243 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1244 can be undefined. If it is case, decode using UTF-8. The following assumes
1245 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1246 bootstrapping process where the codecs aren't ready yet.
1247 */
1248 if (Py_FileSystemDefaultEncoding) {
1249#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1250 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs")) {
1251 return PyUnicode_DecodeMBCS(s, size, "replace");
1252 }
1253#elif defined(__APPLE__)
1254 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8")) {
1255 return PyUnicode_DecodeUTF8(s, size, "replace");
1256 }
1257#endif
1258 return PyUnicode_Decode(s, size,
1259 Py_FileSystemDefaultEncoding,
1260 "replace");
1261 }
1262 else {
1263 return PyUnicode_DecodeUTF8(s, size, "replace");
1264 }
1265}
1266
Martin v. Löwis5b222132007-06-10 09:51:05 +00001267char*
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001268PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001269{
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001270 PyObject *str8;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001271 if (!PyUnicode_Check(unicode)) {
1272 PyErr_BadArgument();
1273 return NULL;
1274 }
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001275 str8 = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1276 if (str8 == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001277 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001278 if (psize != NULL)
1279 *psize = PyString_GET_SIZE(str8);
1280 return PyString_AS_STRING(str8);
1281}
1282
1283char*
1284PyUnicode_AsString(PyObject *unicode)
1285{
1286 return PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001287}
1288
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1290{
1291 if (!PyUnicode_Check(unicode)) {
1292 PyErr_BadArgument();
1293 goto onError;
1294 }
1295 return PyUnicode_AS_UNICODE(unicode);
1296
1297 onError:
1298 return NULL;
1299}
1300
Martin v. Löwis18e16552006-02-15 17:27:45 +00001301Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001302{
1303 if (!PyUnicode_Check(unicode)) {
1304 PyErr_BadArgument();
1305 goto onError;
1306 }
1307 return PyUnicode_GET_SIZE(unicode);
1308
1309 onError:
1310 return -1;
1311}
1312
Thomas Wouters78890102000-07-22 19:25:51 +00001313const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001314{
1315 return unicode_default_encoding;
1316}
1317
1318int PyUnicode_SetDefaultEncoding(const char *encoding)
1319{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001320 if (strcmp(encoding, unicode_default_encoding) != 0) {
1321 PyErr_Format(PyExc_ValueError,
1322 "Can only set default encoding to %s",
1323 unicode_default_encoding);
1324 return -1;
1325 }
Fred Drakee4315f52000-05-09 19:53:39 +00001326 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001327}
1328
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001329/* error handling callback helper:
1330 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001331 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001332 and adjust various state variables.
1333 return 0 on success, -1 on error
1334*/
1335
1336static
1337int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1338 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001339 const char **input, const char **inend, Py_ssize_t *startinpos,
1340 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001341 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001342{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001343 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001344
1345 PyObject *restuple = NULL;
1346 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001347 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001348 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001349 Py_ssize_t requiredsize;
1350 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001351 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001352 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001353 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001354 int res = -1;
1355
1356 if (*errorHandler == NULL) {
1357 *errorHandler = PyCodec_LookupError(errors);
1358 if (*errorHandler == NULL)
1359 goto onError;
1360 }
1361
1362 if (*exceptionObject == NULL) {
1363 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001364 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001365 if (*exceptionObject == NULL)
1366 goto onError;
1367 }
1368 else {
1369 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1370 goto onError;
1371 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1372 goto onError;
1373 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1374 goto onError;
1375 }
1376
1377 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1378 if (restuple == NULL)
1379 goto onError;
1380 if (!PyTuple_Check(restuple)) {
1381 PyErr_Format(PyExc_TypeError, &argparse[4]);
1382 goto onError;
1383 }
1384 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1385 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001386
1387 /* Copy back the bytes variables, which might have been modified by the
1388 callback */
1389 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1390 if (!inputobj)
1391 goto onError;
1392 if (!PyBytes_Check(inputobj)) {
1393 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1394 }
1395 *input = PyBytes_AS_STRING(inputobj);
1396 insize = PyBytes_GET_SIZE(inputobj);
1397 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001398 /* we can DECREF safely, as the exception has another reference,
1399 so the object won't go away. */
1400 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001401
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001402 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001403 newpos = insize+newpos;
1404 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001405 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001406 goto onError;
1407 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001408
1409 /* need more space? (at least enough for what we
1410 have+the replacement+the rest of the string (starting
1411 at the new input position), so we won't have to check space
1412 when there are no errors in the rest of the string) */
1413 repptr = PyUnicode_AS_UNICODE(repunicode);
1414 repsize = PyUnicode_GET_SIZE(repunicode);
1415 requiredsize = *outpos + repsize + insize-newpos;
1416 if (requiredsize > outsize) {
1417 if (requiredsize<2*outsize)
1418 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001419 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001420 goto onError;
1421 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1422 }
1423 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001424 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001425 Py_UNICODE_COPY(*outptr, repptr, repsize);
1426 *outptr += repsize;
1427 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001428
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001429 /* we made it! */
1430 res = 0;
1431
1432 onError:
1433 Py_XDECREF(restuple);
1434 return res;
1435}
1436
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001437/* --- UTF-7 Codec -------------------------------------------------------- */
1438
1439/* see RFC2152 for details */
1440
/* Classification of the 128 ASCII characters for the UTF-7 codec,
   indexed by character code.  Each entry tells whether the character
   is special, i.e. cannot be directly encoded:
     0 - not special
     1 - special
     2 - whitespace (optional)
     3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1459
/* SPECIAL(c, encodeO, encodeWS) is true when character c cannot be
   written directly in UTF-7: c is outside 1..127, or utf7_special marks
   it as special (1), or as whitespace (2) while encodeWS is set, or as
   Set O (3) while encodeO is set.

   Note: the test is written (c) <= 0 rather than (c) < 0 to work around
   gcc warnings about the comparison always being false; since
   utf7_special[0] is 1, treating 0 as special here gives the same
   result. */

#define SPECIAL(c, encodeO, encodeWS)               \
    ((c) > 127 || (c) <= 0 ||                       \
     utf7_special[(c)] == 1 ||                      \
     (encodeWS && (utf7_special[(c)] == 2)) ||      \
     (encodeO && (utf7_special[(c)] == 3)))
1469
/* B64(n): base64 digit for the low six bits of n. */
#define B64(n)                                                              \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"     \
     [(n) & 0x3f])
/* B64CHAR(c): true when c is one of the 64 base64 digits
   (A-Z, a-z, 0-9, '+', '/').
   Explicit range checks are used instead of isalnum(), so the result
   does not depend on the current locale and the macro is well defined
   for any integer value of c, including values above 255. */
#define B64CHAR(c)                                              \
    (((c) >= 'A' && (c) <= 'Z') ||                              \
     ((c) >= 'a' && (c) <= 'z') ||                              \
     ((c) >= '0' && (c) <= '9') ||                              \
     (c) == '+' || (c) == '/')
/* UB64(c): numeric value (0..63) of the base64 digit c.  The result is
   only meaningful when c is a base64 digit.  The constants are ASCII
   offsets: 'a' - 26 == 71, 'A' - 0 == 65 and 52 - '0' == 4. */
#define UB64(c)                         \
    ((c) == '+' ? 62 :                  \
     (c) == '/' ? 63 :                  \
     (c) >= 'a' ? (c) - 71 :            \
     (c) >= 'A' ? (c) - 65 :            \
     (c) + 4 )
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001477
/* ENCODE(out, ch, bits): while at least six bits are pending in ch,
   write the base64 digit for the topmost six pending bits to *out,
   advance out and lower bits by six.  out and bits are modified in
   place. */
#define ENCODE(out, ch, bits)                   \
    for (; bits >= 6; bits -= 6) {              \
        *out++ = B64(ch >> (bits-6));           \
    }
1483
/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending
   in ch, take the topmost 16 pending bits as one code unit and lower
   bits by 16.  The unit is then handled as follows:
     - if surrogate is set, the unit is dropped and the flag is cleared;
     - if the unit lies in 0xDC00..0xDFFF, surrogate is set, errmsg is
       assigned and control jumps to the utf7Error label;
     - otherwise the unit is appended to *out.
   The expansion site must provide a variable `errmsg` and a label
   `utf7Error`. */
#define DECODE(out, ch, bits, surrogate)                                    \
    while (bits >= 16) {                                                    \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);       \
        bits -= 16;                                                         \
        if (surrogate) {                                                    \
            /* An error was already raised for the first half of the */     \
            /* pair, so the second half is skipped without checking. */     \
            surrogate = 0;                                                  \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                    \
            /* Surrogate code unit: a surrogate pair cannot be */           \
            /* represented in a 16-bit character. */                        \
            surrogate = 1;                                                  \
            errmsg = "code pairs are not supported";                        \
            goto utf7Error;                                                 \
        } else {                                                            \
            *out++ = outCh;                                                 \
        }                                                                   \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001502
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001503PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001504 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001505 const char *errors)
1506{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001507 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001508 Py_ssize_t startinpos;
1509 Py_ssize_t endinpos;
1510 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001511 const char *e;
1512 PyUnicodeObject *unicode;
1513 Py_UNICODE *p;
1514 const char *errmsg = "";
1515 int inShift = 0;
1516 unsigned int bitsleft = 0;
1517 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001518 int surrogate = 0;
1519 PyObject *errorHandler = NULL;
1520 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001521
1522 unicode = _PyUnicode_New(size);
1523 if (!unicode)
1524 return NULL;
1525 if (size == 0)
1526 return (PyObject *)unicode;
1527
1528 p = unicode->str;
1529 e = s + size;
1530
1531 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001532 Py_UNICODE ch;
1533 restart:
1534 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001535
1536 if (inShift) {
1537 if ((ch == '-') || !B64CHAR(ch)) {
1538 inShift = 0;
1539 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001540
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001541 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1542 if (bitsleft >= 6) {
1543 /* The shift sequence has a partial character in it. If
1544 bitsleft < 6 then we could just classify it as padding
1545 but that is not the case here */
1546
1547 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001548 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001549 }
1550 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001551 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001552 here so indicate the potential of a misencoded character. */
1553
1554 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1555 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1556 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001557 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001558 }
1559
1560 if (ch == '-') {
1561 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001562 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001563 inShift = 1;
1564 }
1565 } else if (SPECIAL(ch,0,0)) {
1566 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001567 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001568 } else {
1569 *p++ = ch;
1570 }
1571 } else {
1572 charsleft = (charsleft << 6) | UB64(ch);
1573 bitsleft += 6;
1574 s++;
1575 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1576 }
1577 }
1578 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001579 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001580 s++;
1581 if (s < e && *s == '-') {
1582 s++;
1583 *p++ = '+';
1584 } else
1585 {
1586 inShift = 1;
1587 bitsleft = 0;
1588 }
1589 }
1590 else if (SPECIAL(ch,0,0)) {
Walter Dörwald2b65c752007-08-30 15:35:26 +00001591 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001592 errmsg = "unexpected special character";
1593 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001594 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001595 }
1596 else {
1597 *p++ = ch;
1598 s++;
1599 }
1600 continue;
1601 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001602 outpos = p-PyUnicode_AS_UNICODE(unicode);
1603 endinpos = s-starts;
1604 if (unicode_decode_call_errorhandler(
1605 errors, &errorHandler,
1606 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001607 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001608 (PyObject **)&unicode, &outpos, &p))
1609 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001610 }
1611
1612 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001613 outpos = p-PyUnicode_AS_UNICODE(unicode);
1614 endinpos = size;
1615 if (unicode_decode_call_errorhandler(
1616 errors, &errorHandler,
1617 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001618 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001619 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001620 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001621 if (s < e)
1622 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001623 }
1624
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001625 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001626 goto onError;
1627
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001628 Py_XDECREF(errorHandler);
1629 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001630 return (PyObject *)unicode;
1631
1632onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001633 Py_XDECREF(errorHandler);
1634 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001635 Py_DECREF(unicode);
1636 return NULL;
1637}
1638
1639
1640PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001641 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001642 int encodeSetO,
1643 int encodeWhiteSpace,
1644 const char *errors)
1645{
1646 PyObject *v;
1647 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001648 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001649 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001650 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001651 unsigned int bitsleft = 0;
1652 unsigned long charsleft = 0;
1653 char * out;
1654 char * start;
1655
1656 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001657 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001658
Walter Dörwald51ab4142007-05-05 14:43:36 +00001659 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001660 if (v == NULL)
1661 return NULL;
1662
Walter Dörwald51ab4142007-05-05 14:43:36 +00001663 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001664 for (;i < size; ++i) {
1665 Py_UNICODE ch = s[i];
1666
1667 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001668 if (ch == '+') {
1669 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001670 *out++ = '-';
1671 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1672 charsleft = ch;
1673 bitsleft = 16;
1674 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001675 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001676 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001677 } else {
1678 *out++ = (char) ch;
1679 }
1680 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001681 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1682 *out++ = B64(charsleft << (6-bitsleft));
1683 charsleft = 0;
1684 bitsleft = 0;
1685 /* Characters not in the BASE64 set implicitly unshift the sequence
1686 so no '-' is required, except if the character is itself a '-' */
1687 if (B64CHAR(ch) || ch == '-') {
1688 *out++ = '-';
1689 }
1690 inShift = 0;
1691 *out++ = (char) ch;
1692 } else {
1693 bitsleft += 16;
1694 charsleft = (charsleft << 16) | ch;
1695 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1696
1697 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001698 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001699 or '-' then the shift sequence will be terminated implicitly and we
1700 don't have to insert a '-'. */
1701
1702 if (bitsleft == 0) {
1703 if (i + 1 < size) {
1704 Py_UNICODE ch2 = s[i+1];
1705
1706 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001707
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001708 } else if (B64CHAR(ch2) || ch2 == '-') {
1709 *out++ = '-';
1710 inShift = 0;
1711 } else {
1712 inShift = 0;
1713 }
1714
1715 }
1716 else {
1717 *out++ = '-';
1718 inShift = 0;
1719 }
1720 }
Tim Petersced69f82003-09-16 20:30:58 +00001721 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001722 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001723 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001724 if (bitsleft) {
1725 *out++= B64(charsleft << (6-bitsleft) );
1726 *out++ = '-';
1727 }
1728
Walter Dörwald51ab4142007-05-05 14:43:36 +00001729 if (PyBytes_Resize(v, out - start)) {
1730 Py_DECREF(v);
1731 return NULL;
1732 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001733 return v;
1734}
1735
1736#undef SPECIAL
1737#undef B64
1738#undef B64CHAR
1739#undef UB64
1740#undef ENCODE
1741#undef DECODE
1742
Guido van Rossumd57fd912000-03-10 22:53:23 +00001743/* --- UTF-8 Codec -------------------------------------------------------- */
1744
/* Map a UTF-8 encoded prefix (lead) byte to the length of the sequence it
   introduces.  Zero means illegal prefix.  See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00-0x0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10-0x1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20-0x2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30-0x3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40-0x4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50-0x5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60-0x6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70-0x7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0x8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90-0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0-0xAF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0-0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xCF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0-0xDF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFF */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1766
Guido van Rossumd57fd912000-03-10 22:53:23 +00001767PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001768 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001769 const char *errors)
1770{
Walter Dörwald69652032004-09-07 20:24:22 +00001771 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1772}
1773
1774PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001775 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001776 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001777 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001778{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001779 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001781 Py_ssize_t startinpos;
1782 Py_ssize_t endinpos;
1783 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784 const char *e;
1785 PyUnicodeObject *unicode;
1786 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001787 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001788 PyObject *errorHandler = NULL;
1789 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001790
1791 /* Note: size will always be longer than the resulting Unicode
1792 character count */
1793 unicode = _PyUnicode_New(size);
1794 if (!unicode)
1795 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001796 if (size == 0) {
1797 if (consumed)
1798 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001799 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001800 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001801
1802 /* Unpack UTF-8 encoded data */
1803 p = unicode->str;
1804 e = s + size;
1805
1806 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001807 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001808
1809 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001810 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001811 s++;
1812 continue;
1813 }
1814
1815 n = utf8_code_length[ch];
1816
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001817 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001818 if (consumed)
1819 break;
1820 else {
1821 errmsg = "unexpected end of data";
1822 startinpos = s-starts;
1823 endinpos = size;
1824 goto utf8Error;
1825 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001826 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001827
1828 switch (n) {
1829
1830 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001831 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001832 startinpos = s-starts;
1833 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001834 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835
1836 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001837 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001838 startinpos = s-starts;
1839 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001840 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001841
1842 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001843 if ((s[1] & 0xc0) != 0x80) {
1844 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001845 startinpos = s-starts;
1846 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001847 goto utf8Error;
1848 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001850 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001851 startinpos = s-starts;
1852 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001853 errmsg = "illegal encoding";
1854 goto utf8Error;
1855 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001857 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001858 break;
1859
1860 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001861 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001862 (s[2] & 0xc0) != 0x80) {
1863 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001864 startinpos = s-starts;
1865 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001866 goto utf8Error;
1867 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001868 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001869 if (ch < 0x0800) {
1870 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001871 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001872
1873 XXX For wide builds (UCS-4) we should probably try
1874 to recombine the surrogates into a single code
1875 unit.
1876 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001877 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001878 startinpos = s-starts;
1879 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001880 goto utf8Error;
1881 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001882 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001883 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001884 break;
1885
1886 case 4:
1887 if ((s[1] & 0xc0) != 0x80 ||
1888 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001889 (s[3] & 0xc0) != 0x80) {
1890 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001891 startinpos = s-starts;
1892 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001893 goto utf8Error;
1894 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001895 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1896 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1897 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001898 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001899 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001900 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001901 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001902 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001903 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001904 startinpos = s-starts;
1905 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001906 goto utf8Error;
1907 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001908#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001909 *p++ = (Py_UNICODE)ch;
1910#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001911 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001912
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001913 /* translate from 10000..10FFFF to 0..FFFF */
1914 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001915
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001916 /* high surrogate = top 10 bits added to D800 */
1917 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001918
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001919 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001920 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001921#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001922 break;
1923
1924 default:
1925 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001926 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001927 startinpos = s-starts;
1928 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001929 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001930 }
1931 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001932 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001933
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001934 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001935 outpos = p-PyUnicode_AS_UNICODE(unicode);
1936 if (unicode_decode_call_errorhandler(
1937 errors, &errorHandler,
1938 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001939 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001940 (PyObject **)&unicode, &outpos, &p))
1941 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001942 }
Walter Dörwald69652032004-09-07 20:24:22 +00001943 if (consumed)
1944 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001945
1946 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001947 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001948 goto onError;
1949
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001950 Py_XDECREF(errorHandler);
1951 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001952 return (PyObject *)unicode;
1953
1954onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001955 Py_XDECREF(errorHandler);
1956 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001957 Py_DECREF(unicode);
1958 return NULL;
1959}
1960
Tim Peters602f7402002-04-27 18:03:26 +00001961/* Allocation strategy: if the string is short, convert into a stack buffer
1962 and allocate exactly as much space needed at the end. Else allocate the
1963 maximum possible needed (4 result bytes per Unicode character), and return
1964 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001965*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001966PyObject *
1967PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001968 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001969 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001970{
Tim Peters602f7402002-04-27 18:03:26 +00001971#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001972
Martin v. Löwis18e16552006-02-15 17:27:45 +00001973 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001974 PyObject *v; /* result string object */
1975 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001976 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001977 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001978 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001979
Tim Peters602f7402002-04-27 18:03:26 +00001980 assert(s != NULL);
1981 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001982
Tim Peters602f7402002-04-27 18:03:26 +00001983 if (size <= MAX_SHORT_UNICHARS) {
1984 /* Write into the stack buffer; nallocated can't overflow.
1985 * At the end, we'll allocate exactly as much heap space as it
1986 * turns out we need.
1987 */
1988 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1989 v = NULL; /* will allocate after we're done */
1990 p = stackbuf;
1991 }
1992 else {
1993 /* Overallocate on the heap, and give the excess back at the end. */
1994 nallocated = size * 4;
1995 if (nallocated / 4 != size) /* overflow! */
1996 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001997 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001998 if (v == NULL)
1999 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002000 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002001 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002002
Tim Peters602f7402002-04-27 18:03:26 +00002003 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002004 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002005
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002006 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002007 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002009
Guido van Rossumd57fd912000-03-10 22:53:23 +00002010 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002011 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002012 *p++ = (char)(0xc0 | (ch >> 6));
2013 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002014 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002015 else {
Tim Peters602f7402002-04-27 18:03:26 +00002016 /* Encode UCS2 Unicode ordinals */
2017 if (ch < 0x10000) {
2018 /* Special case: check for high surrogate */
2019 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2020 Py_UCS4 ch2 = s[i];
2021 /* Check for low surrogate and combine the two to
2022 form a UCS4 value */
2023 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002024 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002025 i++;
2026 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002027 }
Tim Peters602f7402002-04-27 18:03:26 +00002028 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002029 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002030 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002031 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2032 *p++ = (char)(0x80 | (ch & 0x3f));
2033 continue;
2034 }
2035encodeUCS4:
2036 /* Encode UCS4 Unicode ordinals */
2037 *p++ = (char)(0xf0 | (ch >> 18));
2038 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2039 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2040 *p++ = (char)(0x80 | (ch & 0x3f));
2041 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002043
Tim Peters602f7402002-04-27 18:03:26 +00002044 if (v == NULL) {
2045 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002046 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002047 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002048 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002049 }
2050 else {
2051 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002052 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002053 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002054 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002055 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002056 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002057
Tim Peters602f7402002-04-27 18:03:26 +00002058#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002059}
2060
Guido van Rossumd57fd912000-03-10 22:53:23 +00002061PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2062{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002063 if (!PyUnicode_Check(unicode)) {
2064 PyErr_BadArgument();
2065 return NULL;
2066 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002067 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2068 PyUnicode_GET_SIZE(unicode),
2069 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002070}
2071
Walter Dörwald41980ca2007-08-16 21:55:45 +00002072/* --- UTF-32 Codec ------------------------------------------------------- */
2073
2074PyObject *
2075PyUnicode_DecodeUTF32(const char *s,
2076 Py_ssize_t size,
2077 const char *errors,
2078 int *byteorder)
2079{
2080 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2081}
2082
2083PyObject *
2084PyUnicode_DecodeUTF32Stateful(const char *s,
2085 Py_ssize_t size,
2086 const char *errors,
2087 int *byteorder,
2088 Py_ssize_t *consumed)
2089{
2090 const char *starts = s;
2091 Py_ssize_t startinpos;
2092 Py_ssize_t endinpos;
2093 Py_ssize_t outpos;
2094 PyUnicodeObject *unicode;
2095 Py_UNICODE *p;
2096#ifndef Py_UNICODE_WIDE
2097 int i, pairs;
2098#else
2099 const int pairs = 0;
2100#endif
2101 const unsigned char *q, *e;
2102 int bo = 0; /* assume native ordering by default */
2103 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002104 /* Offsets from q for retrieving bytes in the right order. */
2105#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2106 int iorder[] = {0, 1, 2, 3};
2107#else
2108 int iorder[] = {3, 2, 1, 0};
2109#endif
2110 PyObject *errorHandler = NULL;
2111 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002112 /* On narrow builds we split characters outside the BMP into two
2113 codepoints => count how much extra space we need. */
2114#ifndef Py_UNICODE_WIDE
2115 for (i = pairs = 0; i < size/4; i++)
2116 if (((Py_UCS4 *)s)[i] >= 0x10000)
2117 pairs++;
2118#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002119
2120 /* This might be one to much, because of a BOM */
2121 unicode = _PyUnicode_New((size+3)/4+pairs);
2122 if (!unicode)
2123 return NULL;
2124 if (size == 0)
2125 return (PyObject *)unicode;
2126
2127 /* Unpack UTF-32 encoded data */
2128 p = unicode->str;
2129 q = (unsigned char *)s;
2130 e = q + size;
2131
2132 if (byteorder)
2133 bo = *byteorder;
2134
2135 /* Check for BOM marks (U+FEFF) in the input and adjust current
2136 byte order setting accordingly. In native mode, the leading BOM
2137 mark is skipped, in all other modes, it is copied to the output
2138 stream as-is (giving a ZWNBSP character). */
2139 if (bo == 0) {
2140 if (size >= 4) {
2141 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2142 (q[iorder[1]] << 8) | q[iorder[0]];
2143#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2144 if (bom == 0x0000FEFF) {
2145 q += 4;
2146 bo = -1;
2147 }
2148 else if (bom == 0xFFFE0000) {
2149 q += 4;
2150 bo = 1;
2151 }
2152#else
2153 if (bom == 0x0000FEFF) {
2154 q += 4;
2155 bo = 1;
2156 }
2157 else if (bom == 0xFFFE0000) {
2158 q += 4;
2159 bo = -1;
2160 }
2161#endif
2162 }
2163 }
2164
2165 if (bo == -1) {
2166 /* force LE */
2167 iorder[0] = 0;
2168 iorder[1] = 1;
2169 iorder[2] = 2;
2170 iorder[3] = 3;
2171 }
2172 else if (bo == 1) {
2173 /* force BE */
2174 iorder[0] = 3;
2175 iorder[1] = 2;
2176 iorder[2] = 1;
2177 iorder[3] = 0;
2178 }
2179
2180 while (q < e) {
2181 Py_UCS4 ch;
2182 /* remaining bytes at the end? (size should be divisible by 4) */
2183 if (e-q<4) {
2184 if (consumed)
2185 break;
2186 errmsg = "truncated data";
2187 startinpos = ((const char *)q)-starts;
2188 endinpos = ((const char *)e)-starts;
2189 goto utf32Error;
2190 /* The remaining input chars are ignored if the callback
2191 chooses to skip the input */
2192 }
2193 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2194 (q[iorder[1]] << 8) | q[iorder[0]];
2195
2196 if (ch >= 0x110000)
2197 {
2198 errmsg = "codepoint not in range(0x110000)";
2199 startinpos = ((const char *)q)-starts;
2200 endinpos = startinpos+4;
2201 goto utf32Error;
2202 }
2203#ifndef Py_UNICODE_WIDE
2204 if (ch >= 0x10000)
2205 {
2206 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2207 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2208 }
2209 else
2210#endif
2211 *p++ = ch;
2212 q += 4;
2213 continue;
2214 utf32Error:
2215 outpos = p-PyUnicode_AS_UNICODE(unicode);
2216 if (unicode_decode_call_errorhandler(
2217 errors, &errorHandler,
2218 "utf32", errmsg,
2219 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2220 (PyObject **)&unicode, &outpos, &p))
2221 goto onError;
2222 }
2223
2224 if (byteorder)
2225 *byteorder = bo;
2226
2227 if (consumed)
2228 *consumed = (const char *)q-starts;
2229
2230 /* Adjust length */
2231 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2232 goto onError;
2233
2234 Py_XDECREF(errorHandler);
2235 Py_XDECREF(exc);
2236 return (PyObject *)unicode;
2237
2238onError:
2239 Py_DECREF(unicode);
2240 Py_XDECREF(errorHandler);
2241 Py_XDECREF(exc);
2242 return NULL;
2243}
2244
2245PyObject *
2246PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2247 Py_ssize_t size,
2248 const char *errors,
2249 int byteorder)
2250{
2251 PyObject *v;
2252 unsigned char *p;
2253#ifndef Py_UNICODE_WIDE
2254 int i, pairs;
2255#else
2256 const int pairs = 0;
2257#endif
2258 /* Offsets from p for storing byte pairs in the right order. */
2259#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2260 int iorder[] = {0, 1, 2, 3};
2261#else
2262 int iorder[] = {3, 2, 1, 0};
2263#endif
2264
2265#define STORECHAR(CH) \
2266 do { \
2267 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2268 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2269 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2270 p[iorder[0]] = (CH) & 0xff; \
2271 p += 4; \
2272 } while(0)
2273
2274 /* In narrow builds we can output surrogate pairs as one codepoint,
2275 so we need less space. */
2276#ifndef Py_UNICODE_WIDE
2277 for (i = pairs = 0; i < size-1; i++)
2278 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2279 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2280 pairs++;
2281#endif
2282 v = PyBytes_FromStringAndSize(NULL,
2283 4 * (size - pairs + (byteorder == 0)));
2284 if (v == NULL)
2285 return NULL;
2286
2287 p = (unsigned char *)PyBytes_AS_STRING(v);
2288 if (byteorder == 0)
2289 STORECHAR(0xFEFF);
2290 if (size == 0)
2291 return v;
2292
2293 if (byteorder == -1) {
2294 /* force LE */
2295 iorder[0] = 0;
2296 iorder[1] = 1;
2297 iorder[2] = 2;
2298 iorder[3] = 3;
2299 }
2300 else if (byteorder == 1) {
2301 /* force BE */
2302 iorder[0] = 3;
2303 iorder[1] = 2;
2304 iorder[2] = 1;
2305 iorder[3] = 0;
2306 }
2307
2308 while (size-- > 0) {
2309 Py_UCS4 ch = *s++;
2310#ifndef Py_UNICODE_WIDE
2311 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2312 Py_UCS4 ch2 = *s;
2313 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2314 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2315 s++;
2316 size--;
2317 }
2318 }
2319#endif
2320 STORECHAR(ch);
2321 }
2322 return v;
2323#undef STORECHAR
2324}
2325
2326PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2327{
2328 if (!PyUnicode_Check(unicode)) {
2329 PyErr_BadArgument();
2330 return NULL;
2331 }
2332 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2333 PyUnicode_GET_SIZE(unicode),
2334 NULL,
2335 0);
2336}
2337
Guido van Rossumd57fd912000-03-10 22:53:23 +00002338/* --- UTF-16 Codec ------------------------------------------------------- */
2339
Tim Peters772747b2001-08-09 22:21:55 +00002340PyObject *
2341PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002342 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002343 const char *errors,
2344 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002345{
Walter Dörwald69652032004-09-07 20:24:22 +00002346 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2347}
2348
2349PyObject *
2350PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002351 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002352 const char *errors,
2353 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002354 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002355{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002356 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002357 Py_ssize_t startinpos;
2358 Py_ssize_t endinpos;
2359 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002360 PyUnicodeObject *unicode;
2361 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002362 const unsigned char *q, *e;
2363 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002364 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002365 /* Offsets from q for retrieving byte pairs in the right order. */
2366#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2367 int ihi = 1, ilo = 0;
2368#else
2369 int ihi = 0, ilo = 1;
2370#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002371 PyObject *errorHandler = NULL;
2372 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002373
2374 /* Note: size will always be longer than the resulting Unicode
2375 character count */
2376 unicode = _PyUnicode_New(size);
2377 if (!unicode)
2378 return NULL;
2379 if (size == 0)
2380 return (PyObject *)unicode;
2381
2382 /* Unpack UTF-16 encoded data */
2383 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002384 q = (unsigned char *)s;
2385 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002386
2387 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002388 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002389
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002390 /* Check for BOM marks (U+FEFF) in the input and adjust current
2391 byte order setting accordingly. In native mode, the leading BOM
2392 mark is skipped, in all other modes, it is copied to the output
2393 stream as-is (giving a ZWNBSP character). */
2394 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002395 if (size >= 2) {
2396 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002397#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002398 if (bom == 0xFEFF) {
2399 q += 2;
2400 bo = -1;
2401 }
2402 else if (bom == 0xFFFE) {
2403 q += 2;
2404 bo = 1;
2405 }
Tim Petersced69f82003-09-16 20:30:58 +00002406#else
Walter Dörwald69652032004-09-07 20:24:22 +00002407 if (bom == 0xFEFF) {
2408 q += 2;
2409 bo = 1;
2410 }
2411 else if (bom == 0xFFFE) {
2412 q += 2;
2413 bo = -1;
2414 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002415#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002416 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002417 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002418
Tim Peters772747b2001-08-09 22:21:55 +00002419 if (bo == -1) {
2420 /* force LE */
2421 ihi = 1;
2422 ilo = 0;
2423 }
2424 else if (bo == 1) {
2425 /* force BE */
2426 ihi = 0;
2427 ilo = 1;
2428 }
2429
2430 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002431 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002432 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002433 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002434 if (consumed)
2435 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002436 errmsg = "truncated data";
2437 startinpos = ((const char *)q)-starts;
2438 endinpos = ((const char *)e)-starts;
2439 goto utf16Error;
2440 /* The remaining input chars are ignored if the callback
2441 chooses to skip the input */
2442 }
2443 ch = (q[ihi] << 8) | q[ilo];
2444
Tim Peters772747b2001-08-09 22:21:55 +00002445 q += 2;
2446
Guido van Rossumd57fd912000-03-10 22:53:23 +00002447 if (ch < 0xD800 || ch > 0xDFFF) {
2448 *p++ = ch;
2449 continue;
2450 }
2451
2452 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002453 if (q >= e) {
2454 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002455 startinpos = (((const char *)q)-2)-starts;
2456 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002457 goto utf16Error;
2458 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002459 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002460 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2461 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002462 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002463#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002464 *p++ = ch;
2465 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002466#else
2467 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002468#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002469 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002470 }
2471 else {
2472 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002473 startinpos = (((const char *)q)-4)-starts;
2474 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002475 goto utf16Error;
2476 }
2477
Guido van Rossumd57fd912000-03-10 22:53:23 +00002478 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002479 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002480 startinpos = (((const char *)q)-2)-starts;
2481 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002482 /* Fall through to report the error */
2483
2484 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002485 outpos = p-PyUnicode_AS_UNICODE(unicode);
2486 if (unicode_decode_call_errorhandler(
2487 errors, &errorHandler,
2488 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002489 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002490 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002491 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002492 }
2493
2494 if (byteorder)
2495 *byteorder = bo;
2496
Walter Dörwald69652032004-09-07 20:24:22 +00002497 if (consumed)
2498 *consumed = (const char *)q-starts;
2499
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002501 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002502 goto onError;
2503
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002504 Py_XDECREF(errorHandler);
2505 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002506 return (PyObject *)unicode;
2507
2508onError:
2509 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002510 Py_XDECREF(errorHandler);
2511 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002512 return NULL;
2513}
2514
Tim Peters772747b2001-08-09 22:21:55 +00002515PyObject *
2516PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002517 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002518 const char *errors,
2519 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002520{
2521 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002522 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002523#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002524 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002525#else
2526 const int pairs = 0;
2527#endif
Tim Peters772747b2001-08-09 22:21:55 +00002528 /* Offsets from p for storing byte pairs in the right order. */
2529#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2530 int ihi = 1, ilo = 0;
2531#else
2532 int ihi = 0, ilo = 1;
2533#endif
2534
2535#define STORECHAR(CH) \
2536 do { \
2537 p[ihi] = ((CH) >> 8) & 0xff; \
2538 p[ilo] = (CH) & 0xff; \
2539 p += 2; \
2540 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002541
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002542#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002543 for (i = pairs = 0; i < size; i++)
2544 if (s[i] >= 0x10000)
2545 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002546#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002547 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002548 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549 if (v == NULL)
2550 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002551
Walter Dörwald3cc34522007-05-04 10:48:27 +00002552 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002553 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002554 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002555 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002556 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002557
2558 if (byteorder == -1) {
2559 /* force LE */
2560 ihi = 1;
2561 ilo = 0;
2562 }
2563 else if (byteorder == 1) {
2564 /* force BE */
2565 ihi = 0;
2566 ilo = 1;
2567 }
2568
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002569 while (size-- > 0) {
2570 Py_UNICODE ch = *s++;
2571 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002572#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002573 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002574 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2575 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002576 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002577#endif
Tim Peters772747b2001-08-09 22:21:55 +00002578 STORECHAR(ch);
2579 if (ch2)
2580 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002581 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002583#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002584}
2585
2586PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2587{
2588 if (!PyUnicode_Check(unicode)) {
2589 PyErr_BadArgument();
2590 return NULL;
2591 }
2592 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2593 PyUnicode_GET_SIZE(unicode),
2594 NULL,
2595 0);
2596}
2597
2598/* --- Unicode Escape Codec ----------------------------------------------- */
2599
Fredrik Lundh06d12682001-01-24 07:59:11 +00002600static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002601
Guido van Rossumd57fd912000-03-10 22:53:23 +00002602PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002603 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604 const char *errors)
2605{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002606 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002607 Py_ssize_t startinpos;
2608 Py_ssize_t endinpos;
2609 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002610 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002611 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002612 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002613 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002614 char* message;
2615 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002616 PyObject *errorHandler = NULL;
2617 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002618
Guido van Rossumd57fd912000-03-10 22:53:23 +00002619 /* Escaped strings will always be longer than the resulting
2620 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002621 length after conversion to the true value.
2622 (but if the error callback returns a long replacement string
2623 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002624 v = _PyUnicode_New(size);
2625 if (v == NULL)
2626 goto onError;
2627 if (size == 0)
2628 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002629
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002630 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002631 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002632
Guido van Rossumd57fd912000-03-10 22:53:23 +00002633 while (s < end) {
2634 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002635 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002636 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002637
2638 /* Non-escape characters are interpreted as Unicode ordinals */
2639 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002640 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002641 continue;
2642 }
2643
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002644 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002645 /* \ - Escapes */
2646 s++;
2647 switch (*s++) {
2648
2649 /* \x escapes */
2650 case '\n': break;
2651 case '\\': *p++ = '\\'; break;
2652 case '\'': *p++ = '\''; break;
2653 case '\"': *p++ = '\"'; break;
2654 case 'b': *p++ = '\b'; break;
2655 case 'f': *p++ = '\014'; break; /* FF */
2656 case 't': *p++ = '\t'; break;
2657 case 'n': *p++ = '\n'; break;
2658 case 'r': *p++ = '\r'; break;
2659 case 'v': *p++ = '\013'; break; /* VT */
2660 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2661
2662 /* \OOO (octal) escapes */
2663 case '0': case '1': case '2': case '3':
2664 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002665 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002666 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002667 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002668 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002669 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002670 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002671 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002672 break;
2673
Fredrik Lundhccc74732001-02-18 22:13:49 +00002674 /* hex escapes */
2675 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002676 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002677 digits = 2;
2678 message = "truncated \\xXX escape";
2679 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002680
Fredrik Lundhccc74732001-02-18 22:13:49 +00002681 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002682 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002683 digits = 4;
2684 message = "truncated \\uXXXX escape";
2685 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002686
Fredrik Lundhccc74732001-02-18 22:13:49 +00002687 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002688 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002689 digits = 8;
2690 message = "truncated \\UXXXXXXXX escape";
2691 hexescape:
2692 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002693 outpos = p-PyUnicode_AS_UNICODE(v);
2694 if (s+digits>end) {
2695 endinpos = size;
2696 if (unicode_decode_call_errorhandler(
2697 errors, &errorHandler,
2698 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002699 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002700 (PyObject **)&v, &outpos, &p))
2701 goto onError;
2702 goto nextByte;
2703 }
2704 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002705 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002706 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002707 endinpos = (s+i+1)-starts;
2708 if (unicode_decode_call_errorhandler(
2709 errors, &errorHandler,
2710 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002711 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002712 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002713 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002714 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002715 }
2716 chr = (chr<<4) & ~0xF;
2717 if (c >= '0' && c <= '9')
2718 chr += c - '0';
2719 else if (c >= 'a' && c <= 'f')
2720 chr += 10 + c - 'a';
2721 else
2722 chr += 10 + c - 'A';
2723 }
2724 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002725 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002726 /* _decoding_error will have already written into the
2727 target buffer. */
2728 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002729 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002730 /* when we get here, chr is a 32-bit unicode character */
2731 if (chr <= 0xffff)
2732 /* UCS-2 character */
2733 *p++ = (Py_UNICODE) chr;
2734 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002735 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002736 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002737#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002738 *p++ = chr;
2739#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002740 chr -= 0x10000L;
2741 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002742 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002743#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002744 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002745 endinpos = s-starts;
2746 outpos = p-PyUnicode_AS_UNICODE(v);
2747 if (unicode_decode_call_errorhandler(
2748 errors, &errorHandler,
2749 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002750 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002751 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002752 goto onError;
2753 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002754 break;
2755
2756 /* \N{name} */
2757 case 'N':
2758 message = "malformed \\N character escape";
2759 if (ucnhash_CAPI == NULL) {
2760 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002761 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002762 m = PyImport_ImportModule("unicodedata");
2763 if (m == NULL)
2764 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002765 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002766 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002767 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002768 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002769 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002770 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002771 if (ucnhash_CAPI == NULL)
2772 goto ucnhashError;
2773 }
2774 if (*s == '{') {
2775 const char *start = s+1;
2776 /* look for the closing brace */
2777 while (*s != '}' && s < end)
2778 s++;
2779 if (s > start && s < end && *s == '}') {
2780 /* found a name. look it up in the unicode database */
2781 message = "unknown Unicode character name";
2782 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002783 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002784 goto store;
2785 }
2786 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002787 endinpos = s-starts;
2788 outpos = p-PyUnicode_AS_UNICODE(v);
2789 if (unicode_decode_call_errorhandler(
2790 errors, &errorHandler,
2791 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002792 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002793 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002794 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002795 break;
2796
2797 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002798 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002799 message = "\\ at end of string";
2800 s--;
2801 endinpos = s-starts;
2802 outpos = p-PyUnicode_AS_UNICODE(v);
2803 if (unicode_decode_call_errorhandler(
2804 errors, &errorHandler,
2805 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002806 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002807 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002808 goto onError;
2809 }
2810 else {
2811 *p++ = '\\';
2812 *p++ = (unsigned char)s[-1];
2813 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002814 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002815 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002816 nextByte:
2817 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002818 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002819 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002820 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002821 Py_XDECREF(errorHandler);
2822 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002823 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002824
Fredrik Lundhccc74732001-02-18 22:13:49 +00002825ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002826 PyErr_SetString(
2827 PyExc_UnicodeError,
2828 "\\N escapes not supported (can't load unicodedata module)"
2829 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002830 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002831 Py_XDECREF(errorHandler);
2832 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002833 return NULL;
2834
Fredrik Lundhccc74732001-02-18 22:13:49 +00002835onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002837 Py_XDECREF(errorHandler);
2838 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002839 return NULL;
2840}
2841
2842/* Return a Unicode-Escape string version of the Unicode object.
2843
2844 If quotes is true, the string is enclosed in u"" or u'' quotes as
2845 appropriate.
2846
2847*/
2848
Thomas Wouters477c8d52006-05-27 19:21:47 +00002849Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2850 Py_ssize_t size,
2851 Py_UNICODE ch)
2852{
2853 /* like wcschr, but doesn't stop at NULL characters */
2854
2855 while (size-- > 0) {
2856 if (*s == ch)
2857 return s;
2858 s++;
2859 }
2860
2861 return NULL;
2862}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002863
Walter Dörwald79e913e2007-05-12 11:08:06 +00002864static const char *hexdigits = "0123456789abcdef";
2865
2866PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2867 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002868{
2869 PyObject *repr;
2870 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002871
Thomas Wouters89f507f2006-12-13 04:49:30 +00002872 /* XXX(nnorwitz): rather than over-allocating, it would be
2873 better to choose a different scheme. Perhaps scan the
2874 first N-chars of the string and allocate based on that size.
2875 */
2876 /* Initial allocation is based on the longest-possible unichr
2877 escape.
2878
2879 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2880 unichr, so in this case it's the longest unichr escape. In
2881 narrow (UTF-16) builds this is five chars per source unichr
2882 since there are two unichrs in the surrogate pair, so in narrow
2883 (UTF-16) builds it's not the longest unichr escape.
2884
2885 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2886 so in the narrow (UTF-16) build case it's the longest unichr
2887 escape.
2888 */
2889
Walter Dörwald79e913e2007-05-12 11:08:06 +00002890 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002891#ifdef Py_UNICODE_WIDE
2892 + 10*size
2893#else
2894 + 6*size
2895#endif
2896 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002897 if (repr == NULL)
2898 return NULL;
2899
Walter Dörwald79e913e2007-05-12 11:08:06 +00002900 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002901
Guido van Rossumd57fd912000-03-10 22:53:23 +00002902 while (size-- > 0) {
2903 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002904
Walter Dörwald79e913e2007-05-12 11:08:06 +00002905 /* Escape backslashes */
2906 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002907 *p++ = '\\';
2908 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002909 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002910 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002911
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002912#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002913 /* Map 21-bit characters to '\U00xxxxxx' */
2914 else if (ch >= 0x10000) {
2915 *p++ = '\\';
2916 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002917 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2918 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2919 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2920 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2921 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2922 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2923 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2924 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002925 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002926 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002927#else
2928 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002929 else if (ch >= 0xD800 && ch < 0xDC00) {
2930 Py_UNICODE ch2;
2931 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002932
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002933 ch2 = *s++;
2934 size--;
2935 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2936 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2937 *p++ = '\\';
2938 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002939 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2940 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2941 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2942 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2943 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2944 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2945 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2946 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002947 continue;
2948 }
2949 /* Fall through: isolated surrogates are copied as-is */
2950 s--;
2951 size++;
2952 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002953#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002954
Guido van Rossumd57fd912000-03-10 22:53:23 +00002955 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002956 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002957 *p++ = '\\';
2958 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002959 *p++ = hexdigits[(ch >> 12) & 0x000F];
2960 *p++ = hexdigits[(ch >> 8) & 0x000F];
2961 *p++ = hexdigits[(ch >> 4) & 0x000F];
2962 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002963 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002964
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002965 /* Map special whitespace to '\t', \n', '\r' */
2966 else if (ch == '\t') {
2967 *p++ = '\\';
2968 *p++ = 't';
2969 }
2970 else if (ch == '\n') {
2971 *p++ = '\\';
2972 *p++ = 'n';
2973 }
2974 else if (ch == '\r') {
2975 *p++ = '\\';
2976 *p++ = 'r';
2977 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002978
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002979 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002980 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002982 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002983 *p++ = hexdigits[(ch >> 4) & 0x000F];
2984 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002985 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002986
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987 /* Copy everything else as-is */
2988 else
2989 *p++ = (char) ch;
2990 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002991
2992 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002993 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2994 Py_DECREF(repr);
2995 return NULL;
2996 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002997 return repr;
2998}
2999
Guido van Rossumd57fd912000-03-10 22:53:23 +00003000PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3001{
Walter Dörwald79e913e2007-05-12 11:08:06 +00003002 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003003 if (!PyUnicode_Check(unicode)) {
3004 PyErr_BadArgument();
3005 return NULL;
3006 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003007 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3008 PyUnicode_GET_SIZE(unicode));
3009
3010 if (!s)
3011 return NULL;
3012 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3013 PyBytes_GET_SIZE(s));
3014 Py_DECREF(s);
3015 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003016}
3017
3018/* --- Raw Unicode Escape Codec ------------------------------------------- */
3019
3020PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003021 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003022 const char *errors)
3023{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003024 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003025 Py_ssize_t startinpos;
3026 Py_ssize_t endinpos;
3027 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003029 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030 const char *end;
3031 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003032 PyObject *errorHandler = NULL;
3033 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003034
Guido van Rossumd57fd912000-03-10 22:53:23 +00003035 /* Escaped strings will always be longer than the resulting
3036 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003037 length after conversion to the true value. (But decoding error
3038 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039 v = _PyUnicode_New(size);
3040 if (v == NULL)
3041 goto onError;
3042 if (size == 0)
3043 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003044 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003045 end = s + size;
3046 while (s < end) {
3047 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003048 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003049 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003050 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051
3052 /* Non-escape characters are interpreted as Unicode ordinals */
3053 if (*s != '\\') {
3054 *p++ = (unsigned char)*s++;
3055 continue;
3056 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003057 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003058
3059 /* \u-escapes are only interpreted iff the number of leading
3060 backslashes if odd */
3061 bs = s;
3062 for (;s < end;) {
3063 if (*s != '\\')
3064 break;
3065 *p++ = (unsigned char)*s++;
3066 }
3067 if (((s - bs) & 1) == 0 ||
3068 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003069 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003070 continue;
3071 }
3072 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003073 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003074 s++;
3075
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003076 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003077 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003078 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003079 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003080 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003081 endinpos = s-starts;
3082 if (unicode_decode_call_errorhandler(
3083 errors, &errorHandler,
3084 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003085 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003086 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003088 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003089 }
3090 x = (x<<4) & ~0xF;
3091 if (c >= '0' && c <= '9')
3092 x += c - '0';
3093 else if (c >= 'a' && c <= 'f')
3094 x += 10 + c - 'a';
3095 else
3096 x += 10 + c - 'A';
3097 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003098#ifndef Py_UNICODE_WIDE
3099 if (x > 0x10000) {
3100 if (unicode_decode_call_errorhandler(
3101 errors, &errorHandler,
3102 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003103 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003104 (PyObject **)&v, &outpos, &p))
3105 goto onError;
3106 }
3107#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003108 *p++ = x;
3109 nextByte:
3110 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003111 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003112 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003113 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003114 Py_XDECREF(errorHandler);
3115 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003116 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003117
Guido van Rossumd57fd912000-03-10 22:53:23 +00003118 onError:
3119 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003120 Py_XDECREF(errorHandler);
3121 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003122 return NULL;
3123}
3124
3125PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003126 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003127{
3128 PyObject *repr;
3129 char *p;
3130 char *q;
3131
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003132#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00003133 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003134#else
Walter Dörwald711005d2007-05-12 12:03:26 +00003135 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003136#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003137 if (repr == NULL)
3138 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003139 if (size == 0)
3140 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003141
Walter Dörwald711005d2007-05-12 12:03:26 +00003142 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003143 while (size-- > 0) {
3144 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003145#ifdef Py_UNICODE_WIDE
3146 /* Map 32-bit characters to '\Uxxxxxxxx' */
3147 if (ch >= 0x10000) {
3148 *p++ = '\\';
3149 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003150 *p++ = hexdigits[(ch >> 28) & 0xf];
3151 *p++ = hexdigits[(ch >> 24) & 0xf];
3152 *p++ = hexdigits[(ch >> 20) & 0xf];
3153 *p++ = hexdigits[(ch >> 16) & 0xf];
3154 *p++ = hexdigits[(ch >> 12) & 0xf];
3155 *p++ = hexdigits[(ch >> 8) & 0xf];
3156 *p++ = hexdigits[(ch >> 4) & 0xf];
3157 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003158 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003159 else
3160#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003161 /* Map 16-bit characters to '\uxxxx' */
3162 if (ch >= 256) {
3163 *p++ = '\\';
3164 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003165 *p++ = hexdigits[(ch >> 12) & 0xf];
3166 *p++ = hexdigits[(ch >> 8) & 0xf];
3167 *p++ = hexdigits[(ch >> 4) & 0xf];
3168 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003169 }
3170 /* Copy everything else as-is */
3171 else
3172 *p++ = (char) ch;
3173 }
3174 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00003175 if (PyBytes_Resize(repr, p - q)) {
3176 Py_DECREF(repr);
3177 return NULL;
3178 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003179 return repr;
3180}
3181
3182PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3183{
Walter Dörwald711005d2007-05-12 12:03:26 +00003184 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003185 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003186 PyErr_BadArgument();
3187 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003188 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003189 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3190 PyUnicode_GET_SIZE(unicode));
3191
3192 if (!s)
3193 return NULL;
3194 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3195 PyBytes_GET_SIZE(s));
3196 Py_DECREF(s);
3197 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003198}
3199
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003200/* --- Unicode Internal Codec ------------------------------------------- */
3201
3202PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003203 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003204 const char *errors)
3205{
3206 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003207 Py_ssize_t startinpos;
3208 Py_ssize_t endinpos;
3209 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003210 PyUnicodeObject *v;
3211 Py_UNICODE *p;
3212 const char *end;
3213 const char *reason;
3214 PyObject *errorHandler = NULL;
3215 PyObject *exc = NULL;
3216
Neal Norwitzd43069c2006-01-08 01:12:10 +00003217#ifdef Py_UNICODE_WIDE
3218 Py_UNICODE unimax = PyUnicode_GetMax();
3219#endif
3220
Thomas Wouters89f507f2006-12-13 04:49:30 +00003221 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003222 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3223 if (v == NULL)
3224 goto onError;
3225 if (PyUnicode_GetSize((PyObject *)v) == 0)
3226 return (PyObject *)v;
3227 p = PyUnicode_AS_UNICODE(v);
3228 end = s + size;
3229
3230 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003231 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003232 /* We have to sanity check the raw data, otherwise doom looms for
3233 some malformed UCS-4 data. */
3234 if (
3235 #ifdef Py_UNICODE_WIDE
3236 *p > unimax || *p < 0 ||
3237 #endif
3238 end-s < Py_UNICODE_SIZE
3239 )
3240 {
3241 startinpos = s - starts;
3242 if (end-s < Py_UNICODE_SIZE) {
3243 endinpos = end-starts;
3244 reason = "truncated input";
3245 }
3246 else {
3247 endinpos = s - starts + Py_UNICODE_SIZE;
3248 reason = "illegal code point (> 0x10FFFF)";
3249 }
3250 outpos = p - PyUnicode_AS_UNICODE(v);
3251 if (unicode_decode_call_errorhandler(
3252 errors, &errorHandler,
3253 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003254 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003255 (PyObject **)&v, &outpos, &p)) {
3256 goto onError;
3257 }
3258 }
3259 else {
3260 p++;
3261 s += Py_UNICODE_SIZE;
3262 }
3263 }
3264
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003265 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003266 goto onError;
3267 Py_XDECREF(errorHandler);
3268 Py_XDECREF(exc);
3269 return (PyObject *)v;
3270
3271 onError:
3272 Py_XDECREF(v);
3273 Py_XDECREF(errorHandler);
3274 Py_XDECREF(exc);
3275 return NULL;
3276}
3277
Guido van Rossumd57fd912000-03-10 22:53:23 +00003278/* --- Latin-1 Codec ------------------------------------------------------ */
3279
3280PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003281 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003282 const char *errors)
3283{
3284 PyUnicodeObject *v;
3285 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003286
Guido van Rossumd57fd912000-03-10 22:53:23 +00003287 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003288 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003289 Py_UNICODE r = *(unsigned char*)s;
3290 return PyUnicode_FromUnicode(&r, 1);
3291 }
3292
Guido van Rossumd57fd912000-03-10 22:53:23 +00003293 v = _PyUnicode_New(size);
3294 if (v == NULL)
3295 goto onError;
3296 if (size == 0)
3297 return (PyObject *)v;
3298 p = PyUnicode_AS_UNICODE(v);
3299 while (size-- > 0)
3300 *p++ = (unsigned char)*s++;
3301 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003302
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303 onError:
3304 Py_XDECREF(v);
3305 return NULL;
3306}
3307
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003308/* create or adjust a UnicodeEncodeError */
3309static void make_encode_exception(PyObject **exceptionObject,
3310 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003311 const Py_UNICODE *unicode, Py_ssize_t size,
3312 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003313 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003314{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003315 if (*exceptionObject == NULL) {
3316 *exceptionObject = PyUnicodeEncodeError_Create(
3317 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003318 }
3319 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003320 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3321 goto onError;
3322 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3323 goto onError;
3324 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3325 goto onError;
3326 return;
3327 onError:
3328 Py_DECREF(*exceptionObject);
3329 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003330 }
3331}
3332
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003333/* raises a UnicodeEncodeError */
3334static void raise_encode_exception(PyObject **exceptionObject,
3335 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003336 const Py_UNICODE *unicode, Py_ssize_t size,
3337 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003338 const char *reason)
3339{
3340 make_encode_exception(exceptionObject,
3341 encoding, unicode, size, startpos, endpos, reason);
3342 if (*exceptionObject != NULL)
3343 PyCodec_StrictErrors(*exceptionObject);
3344}
3345
3346/* error handling callback helper:
3347 build arguments, call the callback and check the arguments,
3348 put the result into newpos and return the replacement string, which
3349 has to be freed by the caller */
3350static PyObject *unicode_encode_call_errorhandler(const char *errors,
3351 PyObject **errorHandler,
3352 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003353 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3354 Py_ssize_t startpos, Py_ssize_t endpos,
3355 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003356{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003357 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003358
3359 PyObject *restuple;
3360 PyObject *resunicode;
3361
3362 if (*errorHandler == NULL) {
3363 *errorHandler = PyCodec_LookupError(errors);
3364 if (*errorHandler == NULL)
3365 return NULL;
3366 }
3367
3368 make_encode_exception(exceptionObject,
3369 encoding, unicode, size, startpos, endpos, reason);
3370 if (*exceptionObject == NULL)
3371 return NULL;
3372
3373 restuple = PyObject_CallFunctionObjArgs(
3374 *errorHandler, *exceptionObject, NULL);
3375 if (restuple == NULL)
3376 return NULL;
3377 if (!PyTuple_Check(restuple)) {
3378 PyErr_Format(PyExc_TypeError, &argparse[4]);
3379 Py_DECREF(restuple);
3380 return NULL;
3381 }
3382 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3383 &resunicode, newpos)) {
3384 Py_DECREF(restuple);
3385 return NULL;
3386 }
3387 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003388 *newpos = size+*newpos;
3389 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003390 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003391 Py_DECREF(restuple);
3392 return NULL;
3393 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003394 Py_INCREF(resunicode);
3395 Py_DECREF(restuple);
3396 return resunicode;
3397}
3398
3399static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003400 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003401 const char *errors,
3402 int limit)
3403{
3404 /* output object */
3405 PyObject *res;
3406 /* pointers to the beginning and end+1 of input */
3407 const Py_UNICODE *startp = p;
3408 const Py_UNICODE *endp = p + size;
3409 /* pointer to the beginning of the unencodable characters */
3410 /* const Py_UNICODE *badp = NULL; */
3411 /* pointer into the output */
3412 char *str;
3413 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003414 Py_ssize_t respos = 0;
3415 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003416 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3417 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003418 PyObject *errorHandler = NULL;
3419 PyObject *exc = NULL;
3420 /* the following variable is used for caching string comparisons
3421 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3422 int known_errorHandler = -1;
3423
3424 /* allocate enough for a simple encoding without
3425 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003426 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003427 if (res == NULL)
3428 goto onError;
3429 if (size == 0)
3430 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003431 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003432 ressize = size;
3433
3434 while (p<endp) {
3435 Py_UNICODE c = *p;
3436
3437 /* can we encode this? */
3438 if (c<limit) {
3439 /* no overflow check, because we know that the space is enough */
3440 *str++ = (char)c;
3441 ++p;
3442 }
3443 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003444 Py_ssize_t unicodepos = p-startp;
3445 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003446 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003447 Py_ssize_t repsize;
3448 Py_ssize_t newpos;
3449 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003450 Py_UNICODE *uni2;
3451 /* startpos for collecting unencodable chars */
3452 const Py_UNICODE *collstart = p;
3453 const Py_UNICODE *collend = p;
3454 /* find all unecodable characters */
3455 while ((collend < endp) && ((*collend)>=limit))
3456 ++collend;
3457 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3458 if (known_errorHandler==-1) {
3459 if ((errors==NULL) || (!strcmp(errors, "strict")))
3460 known_errorHandler = 1;
3461 else if (!strcmp(errors, "replace"))
3462 known_errorHandler = 2;
3463 else if (!strcmp(errors, "ignore"))
3464 known_errorHandler = 3;
3465 else if (!strcmp(errors, "xmlcharrefreplace"))
3466 known_errorHandler = 4;
3467 else
3468 known_errorHandler = 0;
3469 }
3470 switch (known_errorHandler) {
3471 case 1: /* strict */
3472 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3473 goto onError;
3474 case 2: /* replace */
3475 while (collstart++<collend)
3476 *str++ = '?'; /* fall through */
3477 case 3: /* ignore */
3478 p = collend;
3479 break;
3480 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003481 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003482 /* determine replacement size (temporarily (mis)uses p) */
3483 for (p = collstart, repsize = 0; p < collend; ++p) {
3484 if (*p<10)
3485 repsize += 2+1+1;
3486 else if (*p<100)
3487 repsize += 2+2+1;
3488 else if (*p<1000)
3489 repsize += 2+3+1;
3490 else if (*p<10000)
3491 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003492#ifndef Py_UNICODE_WIDE
3493 else
3494 repsize += 2+5+1;
3495#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003496 else if (*p<100000)
3497 repsize += 2+5+1;
3498 else if (*p<1000000)
3499 repsize += 2+6+1;
3500 else
3501 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003502#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003503 }
3504 requiredsize = respos+repsize+(endp-collend);
3505 if (requiredsize > ressize) {
3506 if (requiredsize<2*ressize)
3507 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003508 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003509 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003510 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003511 ressize = requiredsize;
3512 }
3513 /* generate replacement (temporarily (mis)uses p) */
3514 for (p = collstart; p < collend; ++p) {
3515 str += sprintf(str, "&#%d;", (int)*p);
3516 }
3517 p = collend;
3518 break;
3519 default:
3520 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3521 encoding, reason, startp, size, &exc,
3522 collstart-startp, collend-startp, &newpos);
3523 if (repunicode == NULL)
3524 goto onError;
3525 /* need more space? (at least enough for what we
3526 have+the replacement+the rest of the string, so
3527 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003528 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003529 repsize = PyUnicode_GET_SIZE(repunicode);
3530 requiredsize = respos+repsize+(endp-collend);
3531 if (requiredsize > ressize) {
3532 if (requiredsize<2*ressize)
3533 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003534 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003535 Py_DECREF(repunicode);
3536 goto onError;
3537 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003538 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003539 ressize = requiredsize;
3540 }
3541 /* check if there is anything unencodable in the replacement
3542 and copy it to the output */
3543 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3544 c = *uni2;
3545 if (c >= limit) {
3546 raise_encode_exception(&exc, encoding, startp, size,
3547 unicodepos, unicodepos+1, reason);
3548 Py_DECREF(repunicode);
3549 goto onError;
3550 }
3551 *str = (char)c;
3552 }
3553 p = startp + newpos;
3554 Py_DECREF(repunicode);
3555 }
3556 }
3557 }
3558 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003559 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003560 if (respos<ressize)
3561 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003562 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003563 Py_XDECREF(errorHandler);
3564 Py_XDECREF(exc);
3565 return res;
3566
3567 onError:
3568 Py_XDECREF(res);
3569 Py_XDECREF(errorHandler);
3570 Py_XDECREF(exc);
3571 return NULL;
3572}
3573
Guido van Rossumd57fd912000-03-10 22:53:23 +00003574PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003575 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003576 const char *errors)
3577{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003578 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003579}
3580
3581PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3582{
3583 if (!PyUnicode_Check(unicode)) {
3584 PyErr_BadArgument();
3585 return NULL;
3586 }
3587 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3588 PyUnicode_GET_SIZE(unicode),
3589 NULL);
3590}
3591
3592/* --- 7-bit ASCII Codec -------------------------------------------------- */
3593
Guido van Rossumd57fd912000-03-10 22:53:23 +00003594PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003595 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003596 const char *errors)
3597{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003598 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003599 PyUnicodeObject *v;
3600 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003601 Py_ssize_t startinpos;
3602 Py_ssize_t endinpos;
3603 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003604 const char *e;
3605 PyObject *errorHandler = NULL;
3606 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003607
Guido van Rossumd57fd912000-03-10 22:53:23 +00003608 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003609 if (size == 1 && *(unsigned char*)s < 128) {
3610 Py_UNICODE r = *(unsigned char*)s;
3611 return PyUnicode_FromUnicode(&r, 1);
3612 }
Tim Petersced69f82003-09-16 20:30:58 +00003613
Guido van Rossumd57fd912000-03-10 22:53:23 +00003614 v = _PyUnicode_New(size);
3615 if (v == NULL)
3616 goto onError;
3617 if (size == 0)
3618 return (PyObject *)v;
3619 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003620 e = s + size;
3621 while (s < e) {
3622 register unsigned char c = (unsigned char)*s;
3623 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003624 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003625 ++s;
3626 }
3627 else {
3628 startinpos = s-starts;
3629 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003630 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003631 if (unicode_decode_call_errorhandler(
3632 errors, &errorHandler,
3633 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003634 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003635 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003636 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003637 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003638 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003639 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003640 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003641 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003642 Py_XDECREF(errorHandler);
3643 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003644 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003645
Guido van Rossumd57fd912000-03-10 22:53:23 +00003646 onError:
3647 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003648 Py_XDECREF(errorHandler);
3649 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003650 return NULL;
3651}
3652
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003654 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003655 const char *errors)
3656{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003657 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003658}
3659
3660PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3661{
3662 if (!PyUnicode_Check(unicode)) {
3663 PyErr_BadArgument();
3664 return NULL;
3665 }
3666 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3667 PyUnicode_GET_SIZE(unicode),
3668 NULL);
3669}
3670
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003671#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003672
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003673/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003674
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003675#if SIZEOF_INT < SIZEOF_SSIZE_T
3676#define NEED_RETRY
3677#endif
3678
3679/* XXX This code is limited to "true" double-byte encodings, as
3680 a) it assumes an incomplete character consists of a single byte, and
3681 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3682 encodings, see IsDBCSLeadByteEx documentation. */
3683
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte in
   context, 0 otherwise.

   The byte must satisfy IsDBCSLeadByte(); it is then accepted if CharPrev()
   finds no earlier character, if the previous character does not start
   with a lead byte, or if the previous character is two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return here - before == 2;
}
3694
3695/*
3696 * Decode MBCS string into unicode object. If 'final' is set, converts
3697 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3698 */
3699static int decode_mbcs(PyUnicodeObject **v,
3700 const char *s, /* MBCS string */
3701 int size, /* sizeof MBCS string */
3702 int final)
3703{
3704 Py_UNICODE *p;
3705 Py_ssize_t n = 0;
3706 int usize = 0;
3707
3708 assert(size >= 0);
3709
3710 /* Skip trailing lead-byte unless 'final' is set */
3711 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3712 --size;
3713
3714 /* First get the size of the result */
3715 if (size > 0) {
3716 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3717 if (usize == 0) {
3718 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3719 return -1;
3720 }
3721 }
3722
3723 if (*v == NULL) {
3724 /* Create unicode object */
3725 *v = _PyUnicode_New(usize);
3726 if (*v == NULL)
3727 return -1;
3728 }
3729 else {
3730 /* Extend unicode object */
3731 n = PyUnicode_GET_SIZE(*v);
3732 if (_PyUnicode_Resize(v, n + usize) < 0)
3733 return -1;
3734 }
3735
3736 /* Do the conversion */
3737 if (size > 0) {
3738 p = PyUnicode_AS_UNICODE(*v) + n;
3739 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3740 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3741 return -1;
3742 }
3743 }
3744
3745 return size;
3746}
3747
3748PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3749 Py_ssize_t size,
3750 const char *errors,
3751 Py_ssize_t *consumed)
3752{
3753 PyUnicodeObject *v = NULL;
3754 int done;
3755
3756 if (consumed)
3757 *consumed = 0;
3758
3759#ifdef NEED_RETRY
3760 retry:
3761 if (size > INT_MAX)
3762 done = decode_mbcs(&v, s, INT_MAX, 0);
3763 else
3764#endif
3765 done = decode_mbcs(&v, s, (int)size, !consumed);
3766
3767 if (done < 0) {
3768 Py_XDECREF(v);
3769 return NULL;
3770 }
3771
3772 if (consumed)
3773 *consumed += done;
3774
3775#ifdef NEED_RETRY
3776 if (size > INT_MAX) {
3777 s += done;
3778 size -= done;
3779 goto retry;
3780 }
3781#endif
3782
3783 return (PyObject *)v;
3784}
3785
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003786PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003787 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003788 const char *errors)
3789{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003790 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3791}
3792
3793/*
3794 * Convert unicode into string object (MBCS).
3795 * Returns 0 if succeed, -1 otherwise.
3796 */
3797static int encode_mbcs(PyObject **repr,
3798 const Py_UNICODE *p, /* unicode */
3799 int size) /* size of unicode */
3800{
3801 int mbcssize = 0;
3802 Py_ssize_t n = 0;
3803
3804 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003805
3806 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003807 if (size > 0) {
3808 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3809 if (mbcssize == 0) {
3810 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3811 return -1;
3812 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003813 }
3814
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003815 if (*repr == NULL) {
3816 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003817 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003818 if (*repr == NULL)
3819 return -1;
3820 }
3821 else {
3822 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003823 n = PyBytes_Size(*repr);
3824 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003825 return -1;
3826 }
3827
3828 /* Do the conversion */
3829 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003830 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003831 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3832 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3833 return -1;
3834 }
3835 }
3836
3837 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003838}
3839
3840PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003841 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003842 const char *errors)
3843{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003844 PyObject *repr = NULL;
3845 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003846
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003847#ifdef NEED_RETRY
3848 retry:
3849 if (size > INT_MAX)
3850 ret = encode_mbcs(&repr, p, INT_MAX);
3851 else
3852#endif
3853 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003854
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003855 if (ret < 0) {
3856 Py_XDECREF(repr);
3857 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003858 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003859
3860#ifdef NEED_RETRY
3861 if (size > INT_MAX) {
3862 p += INT_MAX;
3863 size -= INT_MAX;
3864 goto retry;
3865 }
3866#endif
3867
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003868 return repr;
3869}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003870
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003871PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3872{
3873 if (!PyUnicode_Check(unicode)) {
3874 PyErr_BadArgument();
3875 return NULL;
3876 }
3877 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3878 PyUnicode_GET_SIZE(unicode),
3879 NULL);
3880}
3881
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003882#undef NEED_RETRY
3883
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003884#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003885
Guido van Rossumd57fd912000-03-10 22:53:23 +00003886/* --- Character Mapping Codec -------------------------------------------- */
3887
Guido van Rossumd57fd912000-03-10 22:53:23 +00003888PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003889 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003890 PyObject *mapping,
3891 const char *errors)
3892{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003893 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003894 Py_ssize_t startinpos;
3895 Py_ssize_t endinpos;
3896 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003897 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003898 PyUnicodeObject *v;
3899 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003900 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003901 PyObject *errorHandler = NULL;
3902 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003903 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003904 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003905
Guido van Rossumd57fd912000-03-10 22:53:23 +00003906 /* Default to Latin-1 */
3907 if (mapping == NULL)
3908 return PyUnicode_DecodeLatin1(s, size, errors);
3909
3910 v = _PyUnicode_New(size);
3911 if (v == NULL)
3912 goto onError;
3913 if (size == 0)
3914 return (PyObject *)v;
3915 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003916 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003917 if (PyUnicode_CheckExact(mapping)) {
3918 mapstring = PyUnicode_AS_UNICODE(mapping);
3919 maplen = PyUnicode_GET_SIZE(mapping);
3920 while (s < e) {
3921 unsigned char ch = *s;
3922 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003923
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003924 if (ch < maplen)
3925 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003926
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003927 if (x == 0xfffe) {
3928 /* undefined mapping */
3929 outpos = p-PyUnicode_AS_UNICODE(v);
3930 startinpos = s-starts;
3931 endinpos = startinpos+1;
3932 if (unicode_decode_call_errorhandler(
3933 errors, &errorHandler,
3934 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003935 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003936 (PyObject **)&v, &outpos, &p)) {
3937 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003938 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003939 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003940 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003941 *p++ = x;
3942 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003943 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003944 }
3945 else {
3946 while (s < e) {
3947 unsigned char ch = *s;
3948 PyObject *w, *x;
3949
3950 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3951 w = PyInt_FromLong((long)ch);
3952 if (w == NULL)
3953 goto onError;
3954 x = PyObject_GetItem(mapping, w);
3955 Py_DECREF(w);
3956 if (x == NULL) {
3957 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3958 /* No mapping found means: mapping is undefined. */
3959 PyErr_Clear();
3960 x = Py_None;
3961 Py_INCREF(x);
3962 } else
3963 goto onError;
3964 }
3965
3966 /* Apply mapping */
3967 if (PyInt_Check(x)) {
3968 long value = PyInt_AS_LONG(x);
3969 if (value < 0 || value > 65535) {
3970 PyErr_SetString(PyExc_TypeError,
3971 "character mapping must be in range(65536)");
3972 Py_DECREF(x);
3973 goto onError;
3974 }
3975 *p++ = (Py_UNICODE)value;
3976 }
3977 else if (x == Py_None) {
3978 /* undefined mapping */
3979 outpos = p-PyUnicode_AS_UNICODE(v);
3980 startinpos = s-starts;
3981 endinpos = startinpos+1;
3982 if (unicode_decode_call_errorhandler(
3983 errors, &errorHandler,
3984 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003985 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003986 (PyObject **)&v, &outpos, &p)) {
3987 Py_DECREF(x);
3988 goto onError;
3989 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003990 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003991 continue;
3992 }
3993 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003994 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003995
3996 if (targetsize == 1)
3997 /* 1-1 mapping */
3998 *p++ = *PyUnicode_AS_UNICODE(x);
3999
4000 else if (targetsize > 1) {
4001 /* 1-n mapping */
4002 if (targetsize > extrachars) {
4003 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004004 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4005 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004006 (targetsize << 2);
4007 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00004008 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004009 if (_PyUnicode_Resize(&v,
4010 PyUnicode_GET_SIZE(v) + needed) < 0) {
4011 Py_DECREF(x);
4012 goto onError;
4013 }
4014 p = PyUnicode_AS_UNICODE(v) + oldpos;
4015 }
4016 Py_UNICODE_COPY(p,
4017 PyUnicode_AS_UNICODE(x),
4018 targetsize);
4019 p += targetsize;
4020 extrachars -= targetsize;
4021 }
4022 /* 1-0 mapping: skip the character */
4023 }
4024 else {
4025 /* wrong return value */
4026 PyErr_SetString(PyExc_TypeError,
4027 "character mapping must return integer, None or unicode");
4028 Py_DECREF(x);
4029 goto onError;
4030 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004031 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004032 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004033 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004034 }
4035 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004036 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004037 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004038 Py_XDECREF(errorHandler);
4039 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004040 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004041
Guido van Rossumd57fd912000-03-10 22:53:23 +00004042 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004043 Py_XDECREF(errorHandler);
4044 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004045 Py_XDECREF(v);
4046 return NULL;
4047}
4048
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004049/* Charmap encoding: the lookup table */
4050
4051struct encoding_map{
4052 PyObject_HEAD
4053 unsigned char level1[32];
4054 int count2, count3;
4055 unsigned char level23[1];
4056};
4057
4058static PyObject*
4059encoding_map_size(PyObject *obj, PyObject* args)
4060{
4061 struct encoding_map *map = (struct encoding_map*)obj;
4062 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4063 128*map->count3);
4064}
4065
4066static PyMethodDef encoding_map_methods[] = {
4067 {"size", encoding_map_size, METH_NOARGS,
4068 PyDoc_STR("Return the size (in bytes) of this object") },
4069 { 0 }
4070};
4071
4072static void
4073encoding_map_dealloc(PyObject* o)
4074{
4075 PyObject_FREE(o);
4076}
4077
4078static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004079 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004080 "EncodingMap", /*tp_name*/
4081 sizeof(struct encoding_map), /*tp_basicsize*/
4082 0, /*tp_itemsize*/
4083 /* methods */
4084 encoding_map_dealloc, /*tp_dealloc*/
4085 0, /*tp_print*/
4086 0, /*tp_getattr*/
4087 0, /*tp_setattr*/
4088 0, /*tp_compare*/
4089 0, /*tp_repr*/
4090 0, /*tp_as_number*/
4091 0, /*tp_as_sequence*/
4092 0, /*tp_as_mapping*/
4093 0, /*tp_hash*/
4094 0, /*tp_call*/
4095 0, /*tp_str*/
4096 0, /*tp_getattro*/
4097 0, /*tp_setattro*/
4098 0, /*tp_as_buffer*/
4099 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4100 0, /*tp_doc*/
4101 0, /*tp_traverse*/
4102 0, /*tp_clear*/
4103 0, /*tp_richcompare*/
4104 0, /*tp_weaklistoffset*/
4105 0, /*tp_iter*/
4106 0, /*tp_iternext*/
4107 encoding_map_methods, /*tp_methods*/
4108 0, /*tp_members*/
4109 0, /*tp_getset*/
4110 0, /*tp_base*/
4111 0, /*tp_dict*/
4112 0, /*tp_descr_get*/
4113 0, /*tp_descr_set*/
4114 0, /*tp_dictoffset*/
4115 0, /*tp_init*/
4116 0, /*tp_alloc*/
4117 0, /*tp_new*/
4118 0, /*tp_free*/
4119 0, /*tp_is_gc*/
4120};
4121
4122PyObject*
4123PyUnicode_BuildEncodingMap(PyObject* string)
4124{
4125 Py_UNICODE *decode;
4126 PyObject *result;
4127 struct encoding_map *mresult;
4128 int i;
4129 int need_dict = 0;
4130 unsigned char level1[32];
4131 unsigned char level2[512];
4132 unsigned char *mlevel1, *mlevel2, *mlevel3;
4133 int count2 = 0, count3 = 0;
4134
4135 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4136 PyErr_BadArgument();
4137 return NULL;
4138 }
4139 decode = PyUnicode_AS_UNICODE(string);
4140 memset(level1, 0xFF, sizeof level1);
4141 memset(level2, 0xFF, sizeof level2);
4142
4143 /* If there isn't a one-to-one mapping of NULL to \0,
4144 or if there are non-BMP characters, we need to use
4145 a mapping dictionary. */
4146 if (decode[0] != 0)
4147 need_dict = 1;
4148 for (i = 1; i < 256; i++) {
4149 int l1, l2;
4150 if (decode[i] == 0
4151 #ifdef Py_UNICODE_WIDE
4152 || decode[i] > 0xFFFF
4153 #endif
4154 ) {
4155 need_dict = 1;
4156 break;
4157 }
4158 if (decode[i] == 0xFFFE)
4159 /* unmapped character */
4160 continue;
4161 l1 = decode[i] >> 11;
4162 l2 = decode[i] >> 7;
4163 if (level1[l1] == 0xFF)
4164 level1[l1] = count2++;
4165 if (level2[l2] == 0xFF)
4166 level2[l2] = count3++;
4167 }
4168
4169 if (count2 >= 0xFF || count3 >= 0xFF)
4170 need_dict = 1;
4171
4172 if (need_dict) {
4173 PyObject *result = PyDict_New();
4174 PyObject *key, *value;
4175 if (!result)
4176 return NULL;
4177 for (i = 0; i < 256; i++) {
4178 key = value = NULL;
4179 key = PyInt_FromLong(decode[i]);
4180 value = PyInt_FromLong(i);
4181 if (!key || !value)
4182 goto failed1;
4183 if (PyDict_SetItem(result, key, value) == -1)
4184 goto failed1;
4185 Py_DECREF(key);
4186 Py_DECREF(value);
4187 }
4188 return result;
4189 failed1:
4190 Py_XDECREF(key);
4191 Py_XDECREF(value);
4192 Py_DECREF(result);
4193 return NULL;
4194 }
4195
4196 /* Create a three-level trie */
4197 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4198 16*count2 + 128*count3 - 1);
4199 if (!result)
4200 return PyErr_NoMemory();
4201 PyObject_Init(result, &EncodingMapType);
4202 mresult = (struct encoding_map*)result;
4203 mresult->count2 = count2;
4204 mresult->count3 = count3;
4205 mlevel1 = mresult->level1;
4206 mlevel2 = mresult->level23;
4207 mlevel3 = mresult->level23 + 16*count2;
4208 memcpy(mlevel1, level1, 32);
4209 memset(mlevel2, 0xFF, 16*count2);
4210 memset(mlevel3, 0, 128*count3);
4211 count3 = 0;
4212 for (i = 1; i < 256; i++) {
4213 int o1, o2, o3, i2, i3;
4214 if (decode[i] == 0xFFFE)
4215 /* unmapped character */
4216 continue;
4217 o1 = decode[i]>>11;
4218 o2 = (decode[i]>>7) & 0xF;
4219 i2 = 16*mlevel1[o1] + o2;
4220 if (mlevel2[i2] == 0xFF)
4221 mlevel2[i2] = count3++;
4222 o3 = decode[i] & 0x7F;
4223 i3 = 128*mlevel2[i2] + o3;
4224 mlevel3[i3] = i;
4225 }
4226 return result;
4227}
4228
4229static int
4230encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4231{
4232 struct encoding_map *map = (struct encoding_map*)mapping;
4233 int l1 = c>>11;
4234 int l2 = (c>>7) & 0xF;
4235 int l3 = c & 0x7F;
4236 int i;
4237
4238#ifdef Py_UNICODE_WIDE
4239 if (c > 0xFFFF) {
4240 return -1;
4241 }
4242#endif
4243 if (c == 0)
4244 return 0;
4245 /* level 1*/
4246 i = map->level1[l1];
4247 if (i == 0xFF) {
4248 return -1;
4249 }
4250 /* level 2*/
4251 i = map->level23[16*i+l2];
4252 if (i == 0xFF) {
4253 return -1;
4254 }
4255 /* level 3 */
4256 i = map->level23[16*map->count2 + 128*i + l3];
4257 if (i == 0) {
4258 return -1;
4259 }
4260 return i;
4261}
4262
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004263/* Lookup the character ch in the mapping. If the character
4264 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004265 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004266static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004267{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004268 PyObject *w = PyInt_FromLong((long)c);
4269 PyObject *x;
4270
4271 if (w == NULL)
4272 return NULL;
4273 x = PyObject_GetItem(mapping, w);
4274 Py_DECREF(w);
4275 if (x == NULL) {
4276 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4277 /* No mapping found means: mapping is undefined. */
4278 PyErr_Clear();
4279 x = Py_None;
4280 Py_INCREF(x);
4281 return x;
4282 } else
4283 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004284 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004285 else if (x == Py_None)
4286 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004287 else if (PyInt_Check(x)) {
4288 long value = PyInt_AS_LONG(x);
4289 if (value < 0 || value > 255) {
4290 PyErr_SetString(PyExc_TypeError,
4291 "character mapping must be in range(256)");
4292 Py_DECREF(x);
4293 return NULL;
4294 }
4295 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004296 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004297 else if (PyString_Check(x))
4298 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004299 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004300 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004301 PyErr_Format(PyExc_TypeError,
4302 "character mapping must return integer, None or str8, not %.400s",
4303 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004304 Py_DECREF(x);
4305 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004306 }
4307}
4308
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004309static int
Walter Dörwald827b0552007-05-12 13:23:53 +00004310charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004311{
Walter Dörwald827b0552007-05-12 13:23:53 +00004312 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004313 /* exponentially overallocate to minimize reallocations */
4314 if (requiredsize < 2*outsize)
4315 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00004316 if (PyBytes_Resize(outobj, requiredsize)) {
4317 Py_DECREF(outobj);
4318 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004319 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004320 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004321}
4322
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* a lookup or resize error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004326/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004327 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004328 space is available. Return a new reference to the object that
4329 was put in the output buffer, or Py_None, if the mapping was undefined
4330 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004331 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004332static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004333charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00004334 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004335{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004336 PyObject *rep;
4337 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00004338 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004339
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004340 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004341 int res = encoding_map_lookup(c, mapping);
4342 Py_ssize_t requiredsize = *outpos+1;
4343 if (res == -1)
4344 return enc_FAILED;
4345 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004346 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004347 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00004348 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004349 outstart[(*outpos)++] = (char)res;
4350 return enc_SUCCESS;
4351 }
4352
4353 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004354 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004355 return enc_EXCEPTION;
4356 else if (rep==Py_None) {
4357 Py_DECREF(rep);
4358 return enc_FAILED;
4359 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004360 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004361 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004362 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004363 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004364 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004365 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004366 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004367 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004368 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4369 }
4370 else {
4371 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004372 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4373 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004374 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004375 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004376 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004377 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004378 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004379 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004380 memcpy(outstart + *outpos, repchars, repsize);
4381 *outpos += repsize;
4382 }
4383 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004384 Py_DECREF(rep);
4385 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004386}
4387
4388/* handle an error in PyUnicode_EncodeCharmap
4389 Return 0 on success, -1 on error */
4390static
4391int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004392 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004393 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004394 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004395 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004396{
4397 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004398 Py_ssize_t repsize;
4399 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004400 Py_UNICODE *uni2;
4401 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004402 Py_ssize_t collstartpos = *inpos;
4403 Py_ssize_t collendpos = *inpos+1;
4404 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004405 char *encoding = "charmap";
4406 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004407 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004408
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409 /* find all unencodable characters */
4410 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004411 PyObject *rep;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004412 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004413 int res = encoding_map_lookup(p[collendpos], mapping);
4414 if (res != -1)
4415 break;
4416 ++collendpos;
4417 continue;
4418 }
4419
4420 rep = charmapencode_lookup(p[collendpos], mapping);
4421 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004422 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004423 else if (rep!=Py_None) {
4424 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004425 break;
4426 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004427 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004428 ++collendpos;
4429 }
4430 /* cache callback name lookup
4431 * (if not done yet, i.e. it's the first error) */
4432 if (*known_errorHandler==-1) {
4433 if ((errors==NULL) || (!strcmp(errors, "strict")))
4434 *known_errorHandler = 1;
4435 else if (!strcmp(errors, "replace"))
4436 *known_errorHandler = 2;
4437 else if (!strcmp(errors, "ignore"))
4438 *known_errorHandler = 3;
4439 else if (!strcmp(errors, "xmlcharrefreplace"))
4440 *known_errorHandler = 4;
4441 else
4442 *known_errorHandler = 0;
4443 }
4444 switch (*known_errorHandler) {
4445 case 1: /* strict */
4446 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4447 return -1;
4448 case 2: /* replace */
4449 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4450 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004451 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004452 return -1;
4453 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004454 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004455 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4456 return -1;
4457 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004458 }
4459 /* fall through */
4460 case 3: /* ignore */
4461 *inpos = collendpos;
4462 break;
4463 case 4: /* xmlcharrefreplace */
4464 /* generate replacement (temporarily (mis)uses p) */
4465 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4466 char buffer[2+29+1+1];
4467 char *cp;
4468 sprintf(buffer, "&#%d;", (int)p[collpos]);
4469 for (cp = buffer; *cp; ++cp) {
4470 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004471 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004472 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004473 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004474 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4475 return -1;
4476 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004477 }
4478 }
4479 *inpos = collendpos;
4480 break;
4481 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004482 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004483 encoding, reason, p, size, exceptionObject,
4484 collstartpos, collendpos, &newpos);
4485 if (repunicode == NULL)
4486 return -1;
4487 /* generate replacement */
4488 repsize = PyUnicode_GET_SIZE(repunicode);
4489 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4490 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004491 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004492 return -1;
4493 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004494 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004495 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004496 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4497 return -1;
4498 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004499 }
4500 *inpos = newpos;
4501 Py_DECREF(repunicode);
4502 }
4503 return 0;
4504}
4505
Guido van Rossumd57fd912000-03-10 22:53:23 +00004506PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004507 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004508 PyObject *mapping,
4509 const char *errors)
4510{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004511 /* output object */
4512 PyObject *res = NULL;
4513 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004514 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004515 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004516 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004517 PyObject *errorHandler = NULL;
4518 PyObject *exc = NULL;
4519 /* the following variable is used for caching string comparisons
4520 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4521 * 3=ignore, 4=xmlcharrefreplace */
4522 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004523
4524 /* Default to Latin-1 */
4525 if (mapping == NULL)
4526 return PyUnicode_EncodeLatin1(p, size, errors);
4527
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004528 /* allocate enough for a simple encoding without
4529 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004530 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004531 if (res == NULL)
4532 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004533 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004534 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004535
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004536 while (inpos<size) {
4537 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004538 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004539 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004540 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004541 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004542 if (charmap_encoding_error(p, size, &inpos, mapping,
4543 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004544 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004545 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004546 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004547 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004548 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004549 else
4550 /* done with this character => adjust input position */
4551 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004552 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004553
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004554 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004555 if (respos<PyBytes_GET_SIZE(res)) {
4556 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004557 goto onError;
4558 }
4559 Py_XDECREF(exc);
4560 Py_XDECREF(errorHandler);
4561 return res;
4562
4563 onError:
4564 Py_XDECREF(res);
4565 Py_XDECREF(exc);
4566 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004567 return NULL;
4568}
4569
4570PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4571 PyObject *mapping)
4572{
4573 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4574 PyErr_BadArgument();
4575 return NULL;
4576 }
4577 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4578 PyUnicode_GET_SIZE(unicode),
4579 mapping,
4580 NULL);
4581}
4582
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004583/* create or adjust a UnicodeTranslateError */
4584static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004585 const Py_UNICODE *unicode, Py_ssize_t size,
4586 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004587 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004588{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004589 if (*exceptionObject == NULL) {
4590 *exceptionObject = PyUnicodeTranslateError_Create(
4591 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004592 }
4593 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004594 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4595 goto onError;
4596 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4597 goto onError;
4598 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4599 goto onError;
4600 return;
4601 onError:
4602 Py_DECREF(*exceptionObject);
4603 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004604 }
4605}
4606
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004607/* raises a UnicodeTranslateError */
4608static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004609 const Py_UNICODE *unicode, Py_ssize_t size,
4610 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004611 const char *reason)
4612{
4613 make_translate_exception(exceptionObject,
4614 unicode, size, startpos, endpos, reason);
4615 if (*exceptionObject != NULL)
4616 PyCodec_StrictErrors(*exceptionObject);
4617}
4618
4619/* error handling callback helper:
4620 build arguments, call the callback and check the arguments,
4621 put the result into newpos and return the replacement string, which
4622 has to be freed by the caller */
4623static PyObject *unicode_translate_call_errorhandler(const char *errors,
4624 PyObject **errorHandler,
4625 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004626 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4627 Py_ssize_t startpos, Py_ssize_t endpos,
4628 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004629{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004630 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004631
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004632 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004633 PyObject *restuple;
4634 PyObject *resunicode;
4635
4636 if (*errorHandler == NULL) {
4637 *errorHandler = PyCodec_LookupError(errors);
4638 if (*errorHandler == NULL)
4639 return NULL;
4640 }
4641
4642 make_translate_exception(exceptionObject,
4643 unicode, size, startpos, endpos, reason);
4644 if (*exceptionObject == NULL)
4645 return NULL;
4646
4647 restuple = PyObject_CallFunctionObjArgs(
4648 *errorHandler, *exceptionObject, NULL);
4649 if (restuple == NULL)
4650 return NULL;
4651 if (!PyTuple_Check(restuple)) {
4652 PyErr_Format(PyExc_TypeError, &argparse[4]);
4653 Py_DECREF(restuple);
4654 return NULL;
4655 }
4656 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004657 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004658 Py_DECREF(restuple);
4659 return NULL;
4660 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004661 if (i_newpos<0)
4662 *newpos = size+i_newpos;
4663 else
4664 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004665 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004666 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004667 Py_DECREF(restuple);
4668 return NULL;
4669 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004670 Py_INCREF(resunicode);
4671 Py_DECREF(restuple);
4672 return resunicode;
4673}
4674
4675/* Lookup the character ch in the mapping and put the result in result,
4676 which must be decrefed by the caller.
4677 Return 0 on success, -1 on error */
4678static
4679int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4680{
4681 PyObject *w = PyInt_FromLong((long)c);
4682 PyObject *x;
4683
4684 if (w == NULL)
4685 return -1;
4686 x = PyObject_GetItem(mapping, w);
4687 Py_DECREF(w);
4688 if (x == NULL) {
4689 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4690 /* No mapping found means: use 1:1 mapping. */
4691 PyErr_Clear();
4692 *result = NULL;
4693 return 0;
4694 } else
4695 return -1;
4696 }
4697 else if (x == Py_None) {
4698 *result = x;
4699 return 0;
4700 }
4701 else if (PyInt_Check(x)) {
4702 long value = PyInt_AS_LONG(x);
4703 long max = PyUnicode_GetMax();
4704 if (value < 0 || value > max) {
4705 PyErr_Format(PyExc_TypeError,
4706 "character mapping must be in range(0x%lx)", max+1);
4707 Py_DECREF(x);
4708 return -1;
4709 }
4710 *result = x;
4711 return 0;
4712 }
4713 else if (PyUnicode_Check(x)) {
4714 *result = x;
4715 return 0;
4716 }
4717 else {
4718 /* wrong return value */
4719 PyErr_SetString(PyExc_TypeError,
4720 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004721 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004722 return -1;
4723 }
4724}
4725/* ensure that *outobj is at least requiredsize characters long,
4726if not reallocate and adjust various state variables.
4727Return 0 on success, -1 on error */
4728static
Walter Dörwald4894c302003-10-24 14:25:28 +00004729int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004730 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004731{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004732 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004733 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004734 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004735 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004736 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004737 if (requiredsize < 2 * oldsize)
4738 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004739 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004740 return -1;
4741 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004742 }
4743 return 0;
4744}
4745/* lookup the character, put the result in the output string and adjust
4746 various state variables. Return a new reference to the object that
4747 was put in the output buffer in *result, or Py_None, if the mapping was
4748 undefined (in which case no character was written).
4749 The called must decref result.
4750 Return 0 on success, -1 on error. */
4751static
Walter Dörwald4894c302003-10-24 14:25:28 +00004752int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004753 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004754 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004755{
Walter Dörwald4894c302003-10-24 14:25:28 +00004756 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004757 return -1;
4758 if (*res==NULL) {
4759 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004760 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004761 }
4762 else if (*res==Py_None)
4763 ;
4764 else if (PyInt_Check(*res)) {
4765 /* no overflow check, because we know that the space is enough */
4766 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4767 }
4768 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004769 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004770 if (repsize==1) {
4771 /* no overflow check, because we know that the space is enough */
4772 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4773 }
4774 else if (repsize!=0) {
4775 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004776 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004777 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004778 repsize - 1;
4779 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004780 return -1;
4781 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4782 *outp += repsize;
4783 }
4784 }
4785 else
4786 return -1;
4787 return 0;
4788}
4789
4790PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004791 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004792 PyObject *mapping,
4793 const char *errors)
4794{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004795 /* output object */
4796 PyObject *res = NULL;
4797 /* pointers to the beginning and end+1 of input */
4798 const Py_UNICODE *startp = p;
4799 const Py_UNICODE *endp = p + size;
4800 /* pointer into the output */
4801 Py_UNICODE *str;
4802 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004803 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004804 char *reason = "character maps to <undefined>";
4805 PyObject *errorHandler = NULL;
4806 PyObject *exc = NULL;
4807 /* the following variable is used for caching string comparisons
4808 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4809 * 3=ignore, 4=xmlcharrefreplace */
4810 int known_errorHandler = -1;
4811
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812 if (mapping == NULL) {
4813 PyErr_BadArgument();
4814 return NULL;
4815 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004816
4817 /* allocate enough for a simple 1:1 translation without
4818 replacements, if we need more, we'll resize */
4819 res = PyUnicode_FromUnicode(NULL, size);
4820 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004821 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004823 return res;
4824 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004825
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004826 while (p<endp) {
4827 /* try to encode it */
4828 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004829 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004830 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004831 goto onError;
4832 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004833 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004834 if (x!=Py_None) /* it worked => adjust input pointer */
4835 ++p;
4836 else { /* untranslatable character */
4837 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004838 Py_ssize_t repsize;
4839 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004840 Py_UNICODE *uni2;
4841 /* startpos for collecting untranslatable chars */
4842 const Py_UNICODE *collstart = p;
4843 const Py_UNICODE *collend = p+1;
4844 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004846 /* find all untranslatable characters */
4847 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004848 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004849 goto onError;
4850 Py_XDECREF(x);
4851 if (x!=Py_None)
4852 break;
4853 ++collend;
4854 }
4855 /* cache callback name lookup
4856 * (if not done yet, i.e. it's the first error) */
4857 if (known_errorHandler==-1) {
4858 if ((errors==NULL) || (!strcmp(errors, "strict")))
4859 known_errorHandler = 1;
4860 else if (!strcmp(errors, "replace"))
4861 known_errorHandler = 2;
4862 else if (!strcmp(errors, "ignore"))
4863 known_errorHandler = 3;
4864 else if (!strcmp(errors, "xmlcharrefreplace"))
4865 known_errorHandler = 4;
4866 else
4867 known_errorHandler = 0;
4868 }
4869 switch (known_errorHandler) {
4870 case 1: /* strict */
4871 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4872 goto onError;
4873 case 2: /* replace */
4874 /* No need to check for space, this is a 1:1 replacement */
4875 for (coll = collstart; coll<collend; ++coll)
4876 *str++ = '?';
4877 /* fall through */
4878 case 3: /* ignore */
4879 p = collend;
4880 break;
4881 case 4: /* xmlcharrefreplace */
4882 /* generate replacement (temporarily (mis)uses p) */
4883 for (p = collstart; p < collend; ++p) {
4884 char buffer[2+29+1+1];
4885 char *cp;
4886 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004887 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004888 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4889 goto onError;
4890 for (cp = buffer; *cp; ++cp)
4891 *str++ = *cp;
4892 }
4893 p = collend;
4894 break;
4895 default:
4896 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4897 reason, startp, size, &exc,
4898 collstart-startp, collend-startp, &newpos);
4899 if (repunicode == NULL)
4900 goto onError;
4901 /* generate replacement */
4902 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004903 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004904 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4905 Py_DECREF(repunicode);
4906 goto onError;
4907 }
4908 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4909 *str++ = *uni2;
4910 p = startp + newpos;
4911 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004912 }
4913 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004914 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004915 /* Resize if we allocated to much */
4916 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004917 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004918 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004919 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004920 }
4921 Py_XDECREF(exc);
4922 Py_XDECREF(errorHandler);
4923 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004925 onError:
4926 Py_XDECREF(res);
4927 Py_XDECREF(exc);
4928 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004929 return NULL;
4930}
4931
4932PyObject *PyUnicode_Translate(PyObject *str,
4933 PyObject *mapping,
4934 const char *errors)
4935{
4936 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004937
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938 str = PyUnicode_FromObject(str);
4939 if (str == NULL)
4940 goto onError;
4941 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4942 PyUnicode_GET_SIZE(str),
4943 mapping,
4944 errors);
4945 Py_DECREF(str);
4946 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004947
Guido van Rossumd57fd912000-03-10 22:53:23 +00004948 onError:
4949 Py_XDECREF(str);
4950 return NULL;
4951}
Tim Petersced69f82003-09-16 20:30:58 +00004952
Guido van Rossum9e896b32000-04-05 20:11:21 +00004953/* --- Decimal Encoder ---------------------------------------------------- */
4954
4955int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004956 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004957 char *output,
4958 const char *errors)
4959{
4960 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004961 PyObject *errorHandler = NULL;
4962 PyObject *exc = NULL;
4963 const char *encoding = "decimal";
4964 const char *reason = "invalid decimal Unicode string";
4965 /* the following variable is used for caching string comparisons
4966 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4967 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004968
4969 if (output == NULL) {
4970 PyErr_BadArgument();
4971 return -1;
4972 }
4973
4974 p = s;
4975 end = s + length;
4976 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004977 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004978 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004979 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004980 Py_ssize_t repsize;
4981 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004982 Py_UNICODE *uni2;
4983 Py_UNICODE *collstart;
4984 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004985
Guido van Rossum9e896b32000-04-05 20:11:21 +00004986 if (Py_UNICODE_ISSPACE(ch)) {
4987 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004988 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004989 continue;
4990 }
4991 decimal = Py_UNICODE_TODECIMAL(ch);
4992 if (decimal >= 0) {
4993 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004994 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004995 continue;
4996 }
Guido van Rossumba477042000-04-06 18:18:10 +00004997 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004998 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004999 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005000 continue;
5001 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005002 /* All other characters are considered unencodable */
5003 collstart = p;
5004 collend = p+1;
5005 while (collend < end) {
5006 if ((0 < *collend && *collend < 256) ||
5007 !Py_UNICODE_ISSPACE(*collend) ||
5008 Py_UNICODE_TODECIMAL(*collend))
5009 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005010 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005011 /* cache callback name lookup
5012 * (if not done yet, i.e. it's the first error) */
5013 if (known_errorHandler==-1) {
5014 if ((errors==NULL) || (!strcmp(errors, "strict")))
5015 known_errorHandler = 1;
5016 else if (!strcmp(errors, "replace"))
5017 known_errorHandler = 2;
5018 else if (!strcmp(errors, "ignore"))
5019 known_errorHandler = 3;
5020 else if (!strcmp(errors, "xmlcharrefreplace"))
5021 known_errorHandler = 4;
5022 else
5023 known_errorHandler = 0;
5024 }
5025 switch (known_errorHandler) {
5026 case 1: /* strict */
5027 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5028 goto onError;
5029 case 2: /* replace */
5030 for (p = collstart; p < collend; ++p)
5031 *output++ = '?';
5032 /* fall through */
5033 case 3: /* ignore */
5034 p = collend;
5035 break;
5036 case 4: /* xmlcharrefreplace */
5037 /* generate replacement (temporarily (mis)uses p) */
5038 for (p = collstart; p < collend; ++p)
5039 output += sprintf(output, "&#%d;", (int)*p);
5040 p = collend;
5041 break;
5042 default:
5043 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5044 encoding, reason, s, length, &exc,
5045 collstart-s, collend-s, &newpos);
5046 if (repunicode == NULL)
5047 goto onError;
5048 /* generate replacement */
5049 repsize = PyUnicode_GET_SIZE(repunicode);
5050 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5051 Py_UNICODE ch = *uni2;
5052 if (Py_UNICODE_ISSPACE(ch))
5053 *output++ = ' ';
5054 else {
5055 decimal = Py_UNICODE_TODECIMAL(ch);
5056 if (decimal >= 0)
5057 *output++ = '0' + decimal;
5058 else if (0 < ch && ch < 256)
5059 *output++ = (char)ch;
5060 else {
5061 Py_DECREF(repunicode);
5062 raise_encode_exception(&exc, encoding,
5063 s, length, collstart-s, collend-s, reason);
5064 goto onError;
5065 }
5066 }
5067 }
5068 p = s + newpos;
5069 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005070 }
5071 }
5072 /* 0-terminate the output string */
5073 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005074 Py_XDECREF(exc);
5075 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005076 return 0;
5077
5078 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005079 Py_XDECREF(exc);
5080 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005081 return -1;
5082}
5083
Guido van Rossumd57fd912000-03-10 22:53:23 +00005084/* --- Helpers ------------------------------------------------------------ */
5085
Eric Smith8c663262007-08-25 02:26:07 +00005086#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005087
5088#include "stringlib/fastsearch.h"
5089
5090#include "stringlib/count.h"
5091#include "stringlib/find.h"
5092#include "stringlib/partition.h"
5093
/* Helper macro to fix up slice bounds.  It rewrites the variables
   `start` and `end` of the enclosing scope against (obj)->length:
   a negative start is offset by the length and then floored at 0;
   end is capped at the length, and a negative end is offset by the
   length and then floored at 0.
   Wrapped in do { } while (0) so that it expands to exactly one
   statement; use it as `FIX_START_END(obj);`. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5106
Martin v. Löwis18e16552006-02-15 17:27:45 +00005107Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005108 PyObject *substr,
5109 Py_ssize_t start,
5110 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005111{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005112 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005113 PyUnicodeObject* str_obj;
5114 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005115
Thomas Wouters477c8d52006-05-27 19:21:47 +00005116 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5117 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005119 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5120 if (!sub_obj) {
5121 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122 return -1;
5123 }
Tim Petersced69f82003-09-16 20:30:58 +00005124
Thomas Wouters477c8d52006-05-27 19:21:47 +00005125 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005126
Thomas Wouters477c8d52006-05-27 19:21:47 +00005127 result = stringlib_count(
5128 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5129 );
5130
5131 Py_DECREF(sub_obj);
5132 Py_DECREF(str_obj);
5133
Guido van Rossumd57fd912000-03-10 22:53:23 +00005134 return result;
5135}
5136
Martin v. Löwis18e16552006-02-15 17:27:45 +00005137Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005138 PyObject *sub,
5139 Py_ssize_t start,
5140 Py_ssize_t end,
5141 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005142{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005143 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005144
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005146 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005147 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005148 sub = PyUnicode_FromObject(sub);
5149 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005150 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005151 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152 }
Tim Petersced69f82003-09-16 20:30:58 +00005153
Thomas Wouters477c8d52006-05-27 19:21:47 +00005154 if (direction > 0)
5155 result = stringlib_find_slice(
5156 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5157 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5158 start, end
5159 );
5160 else
5161 result = stringlib_rfind_slice(
5162 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5163 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5164 start, end
5165 );
5166
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005168 Py_DECREF(sub);
5169
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170 return result;
5171}
5172
Tim Petersced69f82003-09-16 20:30:58 +00005173static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174int tailmatch(PyUnicodeObject *self,
5175 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005176 Py_ssize_t start,
5177 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178 int direction)
5179{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005180 if (substring->length == 0)
5181 return 1;
5182
Thomas Wouters477c8d52006-05-27 19:21:47 +00005183 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184
5185 end -= substring->length;
5186 if (end < start)
5187 return 0;
5188
5189 if (direction > 0) {
5190 if (Py_UNICODE_MATCH(self, end, substring))
5191 return 1;
5192 } else {
5193 if (Py_UNICODE_MATCH(self, start, substring))
5194 return 1;
5195 }
5196
5197 return 0;
5198}
5199
Martin v. Löwis18e16552006-02-15 17:27:45 +00005200Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005201 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005202 Py_ssize_t start,
5203 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204 int direction)
5205{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005206 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005207
Guido van Rossumd57fd912000-03-10 22:53:23 +00005208 str = PyUnicode_FromObject(str);
5209 if (str == NULL)
5210 return -1;
5211 substr = PyUnicode_FromObject(substr);
5212 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005213 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214 return -1;
5215 }
Tim Petersced69f82003-09-16 20:30:58 +00005216
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217 result = tailmatch((PyUnicodeObject *)str,
5218 (PyUnicodeObject *)substr,
5219 start, end, direction);
5220 Py_DECREF(str);
5221 Py_DECREF(substr);
5222 return result;
5223}
5224
Guido van Rossumd57fd912000-03-10 22:53:23 +00005225/* Apply fixfct filter to the Unicode object self and return a
5226 reference to the modified object */
5227
Tim Petersced69f82003-09-16 20:30:58 +00005228static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229PyObject *fixup(PyUnicodeObject *self,
5230 int (*fixfct)(PyUnicodeObject *s))
5231{
5232
5233 PyUnicodeObject *u;
5234
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005235 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236 if (u == NULL)
5237 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005238
5239 Py_UNICODE_COPY(u->str, self->str, self->length);
5240
Tim Peters7a29bd52001-09-12 03:03:31 +00005241 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242 /* fixfct should return TRUE if it modified the buffer. If
5243 FALSE, return a reference to the original buffer instead
5244 (to save space, not time) */
5245 Py_INCREF(self);
5246 Py_DECREF(u);
5247 return (PyObject*) self;
5248 }
5249 return (PyObject*) u;
5250}
5251
Tim Petersced69f82003-09-16 20:30:58 +00005252static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253int fixupper(PyUnicodeObject *self)
5254{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005255 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256 Py_UNICODE *s = self->str;
5257 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005258
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259 while (len-- > 0) {
5260 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005261
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262 ch = Py_UNICODE_TOUPPER(*s);
5263 if (ch != *s) {
5264 status = 1;
5265 *s = ch;
5266 }
5267 s++;
5268 }
5269
5270 return status;
5271}
5272
Tim Petersced69f82003-09-16 20:30:58 +00005273static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274int fixlower(PyUnicodeObject *self)
5275{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005276 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277 Py_UNICODE *s = self->str;
5278 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005279
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280 while (len-- > 0) {
5281 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005282
Guido van Rossumd57fd912000-03-10 22:53:23 +00005283 ch = Py_UNICODE_TOLOWER(*s);
5284 if (ch != *s) {
5285 status = 1;
5286 *s = ch;
5287 }
5288 s++;
5289 }
5290
5291 return status;
5292}
5293
Tim Petersced69f82003-09-16 20:30:58 +00005294static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295int fixswapcase(PyUnicodeObject *self)
5296{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005297 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005298 Py_UNICODE *s = self->str;
5299 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005300
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301 while (len-- > 0) {
5302 if (Py_UNICODE_ISUPPER(*s)) {
5303 *s = Py_UNICODE_TOLOWER(*s);
5304 status = 1;
5305 } else if (Py_UNICODE_ISLOWER(*s)) {
5306 *s = Py_UNICODE_TOUPPER(*s);
5307 status = 1;
5308 }
5309 s++;
5310 }
5311
5312 return status;
5313}
5314
Tim Petersced69f82003-09-16 20:30:58 +00005315static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316int fixcapitalize(PyUnicodeObject *self)
5317{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005318 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005319 Py_UNICODE *s = self->str;
5320 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005321
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005322 if (len == 0)
5323 return 0;
5324 if (Py_UNICODE_ISLOWER(*s)) {
5325 *s = Py_UNICODE_TOUPPER(*s);
5326 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005328 s++;
5329 while (--len > 0) {
5330 if (Py_UNICODE_ISUPPER(*s)) {
5331 *s = Py_UNICODE_TOLOWER(*s);
5332 status = 1;
5333 }
5334 s++;
5335 }
5336 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337}
5338
5339static
5340int fixtitle(PyUnicodeObject *self)
5341{
5342 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5343 register Py_UNICODE *e;
5344 int previous_is_cased;
5345
5346 /* Shortcut for single character strings */
5347 if (PyUnicode_GET_SIZE(self) == 1) {
5348 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5349 if (*p != ch) {
5350 *p = ch;
5351 return 1;
5352 }
5353 else
5354 return 0;
5355 }
Tim Petersced69f82003-09-16 20:30:58 +00005356
Guido van Rossumd57fd912000-03-10 22:53:23 +00005357 e = p + PyUnicode_GET_SIZE(self);
5358 previous_is_cased = 0;
5359 for (; p < e; p++) {
5360 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005361
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362 if (previous_is_cased)
5363 *p = Py_UNICODE_TOLOWER(ch);
5364 else
5365 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005366
5367 if (Py_UNICODE_ISLOWER(ch) ||
5368 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369 Py_UNICODE_ISTITLE(ch))
5370 previous_is_cased = 1;
5371 else
5372 previous_is_cased = 0;
5373 }
5374 return 1;
5375}
5376
Tim Peters8ce9f162004-08-27 01:49:32 +00005377PyObject *
5378PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379{
Tim Peters8ce9f162004-08-27 01:49:32 +00005380 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005381 const Py_UNICODE blank = ' ';
5382 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005383 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005384 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005385 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5386 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005387 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5388 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005389 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005390 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005391 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392
Tim Peters05eba1f2004-08-27 21:32:02 +00005393 fseq = PySequence_Fast(seq, "");
5394 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005395 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005396 }
5397
Tim Peters91879ab2004-08-27 22:35:44 +00005398 /* Grrrr. A codec may be invoked to convert str objects to
5399 * Unicode, and so it's possible to call back into Python code
5400 * during PyUnicode_FromObject(), and so it's possible for a sick
5401 * codec to change the size of fseq (if seq is a list). Therefore
5402 * we have to keep refetching the size -- can't assume seqlen
5403 * is invariant.
5404 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005405 seqlen = PySequence_Fast_GET_SIZE(fseq);
5406 /* If empty sequence, return u"". */
5407 if (seqlen == 0) {
5408 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5409 goto Done;
5410 }
5411 /* If singleton sequence with an exact Unicode, return that. */
5412 if (seqlen == 1) {
5413 item = PySequence_Fast_GET_ITEM(fseq, 0);
5414 if (PyUnicode_CheckExact(item)) {
5415 Py_INCREF(item);
5416 res = (PyUnicodeObject *)item;
5417 goto Done;
5418 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005419 }
5420
Tim Peters05eba1f2004-08-27 21:32:02 +00005421 /* At least two items to join, or one that isn't exact Unicode. */
5422 if (seqlen > 1) {
5423 /* Set up sep and seplen -- they're needed. */
5424 if (separator == NULL) {
5425 sep = &blank;
5426 seplen = 1;
5427 }
5428 else {
5429 internal_separator = PyUnicode_FromObject(separator);
5430 if (internal_separator == NULL)
5431 goto onError;
5432 sep = PyUnicode_AS_UNICODE(internal_separator);
5433 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005434 /* In case PyUnicode_FromObject() mutated seq. */
5435 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005436 }
5437 }
5438
5439 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005440 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005441 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005442 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005443 res_p = PyUnicode_AS_UNICODE(res);
5444 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005445
Tim Peters05eba1f2004-08-27 21:32:02 +00005446 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005447 Py_ssize_t itemlen;
5448 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005449
5450 item = PySequence_Fast_GET_ITEM(fseq, i);
5451 /* Convert item to Unicode. */
Guido van Rossumf1044292007-09-27 18:01:22 +00005452 if (!PyString_Check(item) && !PyUnicode_Check(item))
5453 {
5454 if (PyBytes_Check(item))
5455 {
5456 PyErr_Format(PyExc_TypeError,
5457 "sequence item %d: join() will not operate on "
5458 "bytes objects", i);
5459 goto onError;
5460 }
5461 item = PyObject_Unicode(item);
Tim Peters8ce9f162004-08-27 01:49:32 +00005462 }
Guido van Rossumf1044292007-09-27 18:01:22 +00005463 else
5464 item = PyUnicode_FromObject(item);
5465
Tim Peters05eba1f2004-08-27 21:32:02 +00005466 if (item == NULL)
5467 goto onError;
5468 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005469
Tim Peters91879ab2004-08-27 22:35:44 +00005470 /* In case PyUnicode_FromObject() mutated seq. */
5471 seqlen = PySequence_Fast_GET_SIZE(fseq);
5472
Tim Peters8ce9f162004-08-27 01:49:32 +00005473 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005475 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005476 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005477 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005478 if (i < seqlen - 1) {
5479 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005480 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005481 goto Overflow;
5482 }
5483 if (new_res_used > res_alloc) {
5484 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005485 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005486 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005487 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005488 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005489 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005490 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005491 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005493 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005494 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005495 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005496
5497 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005498 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005499 res_p += itemlen;
5500 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005501 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005502 res_p += seplen;
5503 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005504 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005505 res_used = new_res_used;
5506 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005507
Tim Peters05eba1f2004-08-27 21:32:02 +00005508 /* Shrink res to match the used area; this probably can't fail,
5509 * but it's cheap to check.
5510 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005511 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005512 goto onError;
5513
5514 Done:
5515 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005516 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517 return (PyObject *)res;
5518
Tim Peters8ce9f162004-08-27 01:49:32 +00005519 Overflow:
5520 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005521 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005522 Py_DECREF(item);
5523 /* fall through */
5524
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005526 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005527 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005528 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529 return NULL;
5530}
5531
Tim Petersced69f82003-09-16 20:30:58 +00005532static
5533PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005534 Py_ssize_t left,
5535 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536 Py_UNICODE fill)
5537{
5538 PyUnicodeObject *u;
5539
5540 if (left < 0)
5541 left = 0;
5542 if (right < 0)
5543 right = 0;
5544
Tim Peters7a29bd52001-09-12 03:03:31 +00005545 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546 Py_INCREF(self);
5547 return self;
5548 }
5549
5550 u = _PyUnicode_New(left + self->length + right);
5551 if (u) {
5552 if (left)
5553 Py_UNICODE_FILL(u->str, fill, left);
5554 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5555 if (right)
5556 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5557 }
5558
5559 return u;
5560}
5561
/* Append the slice data[left:right] to `list` as a new Unicode object.

   Relies on the enclosing function providing a `PyObject *str` scratch
   variable, a `list` object and an `onError` label, which is jumped to
   when either the slice creation or the append fails.  The temporary
   reference held in `str` is always released.

   The body is wrapped in do { } while (0) so the expansion is a single
   statement and cannot capture a following `else`; invoke it with a
   trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5572
5573static
5574PyObject *split_whitespace(PyUnicodeObject *self,
5575 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005576 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005578 register Py_ssize_t i;
5579 register Py_ssize_t j;
5580 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005581 PyObject *str;
5582
5583 for (i = j = 0; i < len; ) {
5584 /* find a token */
5585 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5586 i++;
5587 j = i;
5588 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5589 i++;
5590 if (j < i) {
5591 if (maxcount-- <= 0)
5592 break;
5593 SPLIT_APPEND(self->str, j, i);
5594 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5595 i++;
5596 j = i;
5597 }
5598 }
5599 if (j < len) {
5600 SPLIT_APPEND(self->str, j, len);
5601 }
5602 return list;
5603
5604 onError:
5605 Py_DECREF(list);
5606 return NULL;
5607}
5608
5609PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005610 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005612 register Py_ssize_t i;
5613 register Py_ssize_t j;
5614 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005615 PyObject *list;
5616 PyObject *str;
5617 Py_UNICODE *data;
5618
5619 string = PyUnicode_FromObject(string);
5620 if (string == NULL)
5621 return NULL;
5622 data = PyUnicode_AS_UNICODE(string);
5623 len = PyUnicode_GET_SIZE(string);
5624
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625 list = PyList_New(0);
5626 if (!list)
5627 goto onError;
5628
5629 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005630 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005631
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005633 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635
5636 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005637 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638 if (i < len) {
5639 if (data[i] == '\r' && i + 1 < len &&
5640 data[i+1] == '\n')
5641 i += 2;
5642 else
5643 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005644 if (keepends)
5645 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646 }
Guido van Rossum86662912000-04-11 15:38:46 +00005647 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648 j = i;
5649 }
5650 if (j < len) {
5651 SPLIT_APPEND(data, j, len);
5652 }
5653
5654 Py_DECREF(string);
5655 return list;
5656
5657 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005658 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659 Py_DECREF(string);
5660 return NULL;
5661}
5662
Tim Petersced69f82003-09-16 20:30:58 +00005663static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664PyObject *split_char(PyUnicodeObject *self,
5665 PyObject *list,
5666 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005667 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005669 register Py_ssize_t i;
5670 register Py_ssize_t j;
5671 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005672 PyObject *str;
5673
5674 for (i = j = 0; i < len; ) {
5675 if (self->str[i] == ch) {
5676 if (maxcount-- <= 0)
5677 break;
5678 SPLIT_APPEND(self->str, j, i);
5679 i = j = i + 1;
5680 } else
5681 i++;
5682 }
5683 if (j <= len) {
5684 SPLIT_APPEND(self->str, j, len);
5685 }
5686 return list;
5687
5688 onError:
5689 Py_DECREF(list);
5690 return NULL;
5691}
5692
Tim Petersced69f82003-09-16 20:30:58 +00005693static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694PyObject *split_substring(PyUnicodeObject *self,
5695 PyObject *list,
5696 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005697 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005699 register Py_ssize_t i;
5700 register Py_ssize_t j;
5701 Py_ssize_t len = self->length;
5702 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 PyObject *str;
5704
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005705 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706 if (Py_UNICODE_MATCH(self, i, substring)) {
5707 if (maxcount-- <= 0)
5708 break;
5709 SPLIT_APPEND(self->str, j, i);
5710 i = j = i + sublen;
5711 } else
5712 i++;
5713 }
5714 if (j <= len) {
5715 SPLIT_APPEND(self->str, j, len);
5716 }
5717 return list;
5718
5719 onError:
5720 Py_DECREF(list);
5721 return NULL;
5722}
5723
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005724static
5725PyObject *rsplit_whitespace(PyUnicodeObject *self,
5726 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005727 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005728{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005729 register Py_ssize_t i;
5730 register Py_ssize_t j;
5731 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005732 PyObject *str;
5733
5734 for (i = j = len - 1; i >= 0; ) {
5735 /* find a token */
5736 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5737 i--;
5738 j = i;
5739 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5740 i--;
5741 if (j > i) {
5742 if (maxcount-- <= 0)
5743 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005744 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005745 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5746 i--;
5747 j = i;
5748 }
5749 }
5750 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005751 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005752 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005753 if (PyList_Reverse(list) < 0)
5754 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005755 return list;
5756
5757 onError:
5758 Py_DECREF(list);
5759 return NULL;
5760}
5761
5762static
5763PyObject *rsplit_char(PyUnicodeObject *self,
5764 PyObject *list,
5765 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005766 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005767{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005768 register Py_ssize_t i;
5769 register Py_ssize_t j;
5770 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005771 PyObject *str;
5772
5773 for (i = j = len - 1; i >= 0; ) {
5774 if (self->str[i] == ch) {
5775 if (maxcount-- <= 0)
5776 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005777 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005778 j = i = i - 1;
5779 } else
5780 i--;
5781 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005782 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005783 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005784 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005785 if (PyList_Reverse(list) < 0)
5786 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005787 return list;
5788
5789 onError:
5790 Py_DECREF(list);
5791 return NULL;
5792}
5793
5794static
5795PyObject *rsplit_substring(PyUnicodeObject *self,
5796 PyObject *list,
5797 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005798 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005799{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005800 register Py_ssize_t i;
5801 register Py_ssize_t j;
5802 Py_ssize_t len = self->length;
5803 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005804 PyObject *str;
5805
5806 for (i = len - sublen, j = len; i >= 0; ) {
5807 if (Py_UNICODE_MATCH(self, i, substring)) {
5808 if (maxcount-- <= 0)
5809 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005810 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005811 j = i;
5812 i -= sublen;
5813 } else
5814 i--;
5815 }
5816 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005817 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005818 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005819 if (PyList_Reverse(list) < 0)
5820 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005821 return list;
5822
5823 onError:
5824 Py_DECREF(list);
5825 return NULL;
5826}
5827
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828#undef SPLIT_APPEND
5829
5830static
5831PyObject *split(PyUnicodeObject *self,
5832 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005833 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005834{
5835 PyObject *list;
5836
5837 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005838 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839
5840 list = PyList_New(0);
5841 if (!list)
5842 return NULL;
5843
5844 if (substring == NULL)
5845 return split_whitespace(self,list,maxcount);
5846
5847 else if (substring->length == 1)
5848 return split_char(self,list,substring->str[0],maxcount);
5849
5850 else if (substring->length == 0) {
5851 Py_DECREF(list);
5852 PyErr_SetString(PyExc_ValueError, "empty separator");
5853 return NULL;
5854 }
5855 else
5856 return split_substring(self,list,substring,maxcount);
5857}
5858
Tim Petersced69f82003-09-16 20:30:58 +00005859static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005860PyObject *rsplit(PyUnicodeObject *self,
5861 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005862 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005863{
5864 PyObject *list;
5865
5866 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005867 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005868
5869 list = PyList_New(0);
5870 if (!list)
5871 return NULL;
5872
5873 if (substring == NULL)
5874 return rsplit_whitespace(self,list,maxcount);
5875
5876 else if (substring->length == 1)
5877 return rsplit_char(self,list,substring->str[0],maxcount);
5878
5879 else if (substring->length == 0) {
5880 Py_DECREF(list);
5881 PyErr_SetString(PyExc_ValueError, "empty separator");
5882 return NULL;
5883 }
5884 else
5885 return rsplit_substring(self,list,substring,maxcount);
5886}
5887
5888static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005889PyObject *replace(PyUnicodeObject *self,
5890 PyUnicodeObject *str1,
5891 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005892 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893{
5894 PyUnicodeObject *u;
5895
5896 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005897 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898
Thomas Wouters477c8d52006-05-27 19:21:47 +00005899 if (str1->length == str2->length) {
5900 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005901 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005902 if (str1->length == 1) {
5903 /* replace characters */
5904 Py_UNICODE u1, u2;
5905 if (!findchar(self->str, self->length, str1->str[0]))
5906 goto nothing;
5907 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5908 if (!u)
5909 return NULL;
5910 Py_UNICODE_COPY(u->str, self->str, self->length);
5911 u1 = str1->str[0];
5912 u2 = str2->str[0];
5913 for (i = 0; i < u->length; i++)
5914 if (u->str[i] == u1) {
5915 if (--maxcount < 0)
5916 break;
5917 u->str[i] = u2;
5918 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005920 i = fastsearch(
5921 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005923 if (i < 0)
5924 goto nothing;
5925 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5926 if (!u)
5927 return NULL;
5928 Py_UNICODE_COPY(u->str, self->str, self->length);
5929 while (i <= self->length - str1->length)
5930 if (Py_UNICODE_MATCH(self, i, str1)) {
5931 if (--maxcount < 0)
5932 break;
5933 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5934 i += str1->length;
5935 } else
5936 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005939
5940 Py_ssize_t n, i, j, e;
5941 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942 Py_UNICODE *p;
5943
5944 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005945 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946 if (n > maxcount)
5947 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005948 if (n == 0)
5949 goto nothing;
5950 /* new_size = self->length + n * (str2->length - str1->length)); */
5951 delta = (str2->length - str1->length);
5952 if (delta == 0) {
5953 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005955 product = n * (str2->length - str1->length);
5956 if ((product / (str2->length - str1->length)) != n) {
5957 PyErr_SetString(PyExc_OverflowError,
5958 "replace string is too long");
5959 return NULL;
5960 }
5961 new_size = self->length + product;
5962 if (new_size < 0) {
5963 PyErr_SetString(PyExc_OverflowError,
5964 "replace string is too long");
5965 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966 }
5967 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005968 u = _PyUnicode_New(new_size);
5969 if (!u)
5970 return NULL;
5971 i = 0;
5972 p = u->str;
5973 e = self->length - str1->length;
5974 if (str1->length > 0) {
5975 while (n-- > 0) {
5976 /* look for next match */
5977 j = i;
5978 while (j <= e) {
5979 if (Py_UNICODE_MATCH(self, j, str1))
5980 break;
5981 j++;
5982 }
5983 if (j > i) {
5984 if (j > e)
5985 break;
5986 /* copy unchanged part [i:j] */
5987 Py_UNICODE_COPY(p, self->str+i, j-i);
5988 p += j - i;
5989 }
5990 /* copy substitution string */
5991 if (str2->length > 0) {
5992 Py_UNICODE_COPY(p, str2->str, str2->length);
5993 p += str2->length;
5994 }
5995 i = j + str1->length;
5996 }
5997 if (i < self->length)
5998 /* copy tail [i:] */
5999 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6000 } else {
6001 /* interleave */
6002 while (n > 0) {
6003 Py_UNICODE_COPY(p, str2->str, str2->length);
6004 p += str2->length;
6005 if (--n <= 0)
6006 break;
6007 *p++ = self->str[i++];
6008 }
6009 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6010 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006013
6014nothing:
6015 /* nothing to replace; return original string (when possible) */
6016 if (PyUnicode_CheckExact(self)) {
6017 Py_INCREF(self);
6018 return (PyObject *) self;
6019 }
6020 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021}
6022
6023/* --- Unicode Object Methods --------------------------------------------- */
6024
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006025PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026"S.title() -> unicode\n\
6027\n\
6028Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006029characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030
6031static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006032unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034 return fixup(self, fixtitle);
6035}
6036
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006037PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038"S.capitalize() -> unicode\n\
6039\n\
6040Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006041have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042
6043static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006044unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046 return fixup(self, fixcapitalize);
6047}
6048
#if 0
/* Disabled: kept for reference only, not compiled. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split `self` on whitespace, capitalize every word in place inside the
   list, then join the words again with PyUnicode_Join(NULL, ...).
   Returns NULL on failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;          /* result is still NULL */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
6086
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006087/* Argument converter. Coerces to a single unicode character */
6088
6089static int
6090convert_uc(PyObject *obj, void *addr)
6091{
6092 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6093 PyObject *uniobj;
6094 Py_UNICODE *unistr;
6095
6096 uniobj = PyUnicode_FromObject(obj);
6097 if (uniobj == NULL) {
6098 PyErr_SetString(PyExc_TypeError,
6099 "The fill character cannot be converted to Unicode");
6100 return 0;
6101 }
6102 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6103 PyErr_SetString(PyExc_TypeError,
6104 "The fill character must be exactly one character long");
6105 Py_DECREF(uniobj);
6106 return 0;
6107 }
6108 unistr = PyUnicode_AS_UNICODE(uniobj);
6109 *fillcharloc = unistr[0];
6110 Py_DECREF(uniobj);
6111 return 1;
6112}
6113
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006114PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006115"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006117Return S centered in a Unicode string of length width. Padding is\n\
6118done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119
6120static PyObject *
6121unicode_center(PyUnicodeObject *self, PyObject *args)
6122{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006123 Py_ssize_t marg, left;
6124 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006125 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126
Thomas Woutersde017742006-02-16 19:34:37 +00006127 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128 return NULL;
6129
Tim Peters7a29bd52001-09-12 03:03:31 +00006130 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131 Py_INCREF(self);
6132 return (PyObject*) self;
6133 }
6134
6135 marg = width - self->length;
6136 left = marg / 2 + (marg & width & 1);
6137
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006138 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139}
6140
Marc-André Lemburge5034372000-08-08 08:04:29 +00006141#if 0
6142
6143/* This code should go into some future Unicode collation support
6144 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006145 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006146
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006147/* speedy UTF-16 code point order comparison */
6148/* gleaned from: */
6149/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6150
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006151static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006152{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006153 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006154 0, 0, 0, 0, 0, 0, 0, 0,
6155 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006156 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006157};
6158
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159static int
6160unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6161{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006162 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006163
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164 Py_UNICODE *s1 = str1->str;
6165 Py_UNICODE *s2 = str2->str;
6166
6167 len1 = str1->length;
6168 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006169
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006171 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006172
6173 c1 = *s1++;
6174 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006175
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006176 if (c1 > (1<<11) * 26)
6177 c1 += utf16Fixup[c1>>11];
6178 if (c2 > (1<<11) * 26)
6179 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006180 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006181
6182 if (c1 != c2)
6183 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006184
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006185 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186 }
6187
6188 return (len1 < len2) ? -1 : (len1 != len2);
6189}
6190
Marc-André Lemburge5034372000-08-08 08:04:29 +00006191#else
6192
6193static int
6194unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6195{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006196 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006197
6198 Py_UNICODE *s1 = str1->str;
6199 Py_UNICODE *s2 = str2->str;
6200
6201 len1 = str1->length;
6202 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006203
Marc-André Lemburge5034372000-08-08 08:04:29 +00006204 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006205 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006206
Fredrik Lundh45714e92001-06-26 16:39:36 +00006207 c1 = *s1++;
6208 c2 = *s2++;
6209
6210 if (c1 != c2)
6211 return (c1 < c2) ? -1 : 1;
6212
Marc-André Lemburge5034372000-08-08 08:04:29 +00006213 len1--; len2--;
6214 }
6215
6216 return (len1 < len2) ? -1 : (len1 != len2);
6217}
6218
6219#endif
6220
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221int PyUnicode_Compare(PyObject *left,
6222 PyObject *right)
6223{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006224 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6225 return unicode_compare((PyUnicodeObject *)left,
6226 (PyUnicodeObject *)right);
6227 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
6228 (PyUnicode_Check(left) && PyString_Check(right))) {
6229 if (PyUnicode_Check(left))
6230 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
6231 if (PyUnicode_Check(right))
6232 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
6233 assert(PyString_Check(left));
6234 assert(PyString_Check(right));
6235 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006237 PyErr_Format(PyExc_TypeError,
6238 "Can't compare %.100s and %.100s",
6239 left->ob_type->tp_name,
6240 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241 return -1;
6242}
6243
Martin v. Löwis5b222132007-06-10 09:51:05 +00006244int
6245PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6246{
6247 int i;
6248 Py_UNICODE *id;
6249 assert(PyUnicode_Check(uni));
6250 id = PyUnicode_AS_UNICODE(uni);
6251 /* Compare Unicode string and source character set string */
6252 for (i = 0; id[i] && str[i]; i++)
6253 if (id[i] != str[i])
6254 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6255 if (id[i])
6256 return 1; /* uni is longer */
6257 if (str[i])
6258 return -1; /* str is longer */
6259 return 0;
6260}
6261
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006262PyObject *PyUnicode_RichCompare(PyObject *left,
6263 PyObject *right,
6264 int op)
6265{
6266 int result;
6267
6268 result = PyUnicode_Compare(left, right);
6269 if (result == -1 && PyErr_Occurred())
6270 goto onError;
6271
6272 /* Convert the return value to a Boolean */
6273 switch (op) {
6274 case Py_EQ:
6275 result = (result == 0);
6276 break;
6277 case Py_NE:
6278 result = (result != 0);
6279 break;
6280 case Py_LE:
6281 result = (result <= 0);
6282 break;
6283 case Py_GE:
6284 result = (result >= 0);
6285 break;
6286 case Py_LT:
6287 result = (result == -1);
6288 break;
6289 case Py_GT:
6290 result = (result == 1);
6291 break;
6292 }
6293 return PyBool_FromLong(result);
6294
6295 onError:
6296
6297 /* Standard case
6298
6299 Type errors mean that PyUnicode_FromObject() could not convert
6300 one of the arguments (usually the right hand side) to Unicode,
6301 ie. we can't handle the comparison request. However, it is
6302 possible that the other object knows a comparison method, which
6303 is why we return Py_NotImplemented to give the other object a
6304 chance.
6305
6306 */
6307 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6308 PyErr_Clear();
6309 Py_INCREF(Py_NotImplemented);
6310 return Py_NotImplemented;
6311 }
6312 if (op != Py_EQ && op != Py_NE)
6313 return NULL;
6314
6315 /* Equality comparison.
6316
6317 This is a special case: we silence any PyExc_UnicodeDecodeError
6318 and instead turn it into a PyErr_UnicodeWarning.
6319
6320 */
6321 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6322 return NULL;
6323 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006324 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6325 (op == Py_EQ) ?
6326 "Unicode equal comparison "
6327 "failed to convert both arguments to Unicode - "
6328 "interpreting them as being unequal"
6329 :
6330 "Unicode unequal comparison "
6331 "failed to convert both arguments to Unicode - "
6332 "interpreting them as being unequal",
6333 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006334 return NULL;
6335 result = (op == Py_NE);
6336 return PyBool_FromLong(result);
6337}
6338
Guido van Rossum403d68b2000-03-13 15:55:09 +00006339int PyUnicode_Contains(PyObject *container,
6340 PyObject *element)
6341{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006342 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006343 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006344
6345 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006346 sub = PyUnicode_FromObject(element);
6347 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006348 PyErr_Format(PyExc_TypeError,
6349 "'in <string>' requires string as left operand, not %s",
6350 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006351 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006352 }
6353
Thomas Wouters477c8d52006-05-27 19:21:47 +00006354 str = PyUnicode_FromObject(container);
6355 if (!str) {
6356 Py_DECREF(sub);
6357 return -1;
6358 }
6359
6360 result = stringlib_contains_obj(str, sub);
6361
6362 Py_DECREF(str);
6363 Py_DECREF(sub);
6364
Guido van Rossum403d68b2000-03-13 15:55:09 +00006365 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006366}
6367
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368/* Concat to string or Unicode object giving a new Unicode object. */
6369
6370PyObject *PyUnicode_Concat(PyObject *left,
6371 PyObject *right)
6372{
6373 PyUnicodeObject *u = NULL, *v = NULL, *w;
6374
Guido van Rossum84d79dd2007-04-13 02:23:57 +00006375 if (PyBytes_Check(left) || PyBytes_Check(right))
6376 return PyBytes_Concat(left, right);
6377
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378 /* Coerce the two arguments */
6379 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6380 if (u == NULL)
6381 goto onError;
6382 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6383 if (v == NULL)
6384 goto onError;
6385
6386 /* Shortcuts */
6387 if (v == unicode_empty) {
6388 Py_DECREF(v);
6389 return (PyObject *)u;
6390 }
6391 if (u == unicode_empty) {
6392 Py_DECREF(u);
6393 return (PyObject *)v;
6394 }
6395
6396 /* Concat the two Unicode strings */
6397 w = _PyUnicode_New(u->length + v->length);
6398 if (w == NULL)
6399 goto onError;
6400 Py_UNICODE_COPY(w->str, u->str, u->length);
6401 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6402
6403 Py_DECREF(u);
6404 Py_DECREF(v);
6405 return (PyObject *)w;
6406
6407onError:
6408 Py_XDECREF(u);
6409 Py_XDECREF(v);
6410 return NULL;
6411}
6412
Walter Dörwald1ab83302007-05-18 17:15:44 +00006413void
6414PyUnicode_Append(PyObject **pleft, PyObject *right)
6415{
6416 PyObject *new;
6417 if (*pleft == NULL)
6418 return;
6419 if (right == NULL || !PyUnicode_Check(*pleft)) {
6420 Py_DECREF(*pleft);
6421 *pleft = NULL;
6422 return;
6423 }
6424 new = PyUnicode_Concat(*pleft, right);
6425 Py_DECREF(*pleft);
6426 *pleft = new;
6427}
6428
6429void
6430PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6431{
6432 PyUnicode_Append(pleft, right);
6433 Py_XDECREF(right);
6434}
6435
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006436PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437"S.count(sub[, start[, end]]) -> int\n\
6438\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006439Return the number of non-overlapping occurrences of substring sub in\n\
6440Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006441interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442
6443static PyObject *
6444unicode_count(PyUnicodeObject *self, PyObject *args)
6445{
6446 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006447 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006448 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449 PyObject *result;
6450
Guido van Rossumb8872e62000-05-09 14:14:27 +00006451 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6452 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453 return NULL;
6454
6455 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006456 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457 if (substring == NULL)
6458 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006459
Thomas Wouters477c8d52006-05-27 19:21:47 +00006460 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461
Thomas Wouters477c8d52006-05-27 19:21:47 +00006462 result = PyInt_FromSsize_t(
6463 stringlib_count(self->str + start, end - start,
6464 substring->str, substring->length)
6465 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466
6467 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006468
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469 return result;
6470}
6471
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006472PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006473"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006474\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006475Encodes S using the codec registered for encoding. encoding defaults\n\
6476to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006477handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006478a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6479'xmlcharrefreplace' as well as any other name registered with\n\
6480codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481
6482static PyObject *
6483unicode_encode(PyUnicodeObject *self, PyObject *args)
6484{
6485 char *encoding = NULL;
6486 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006487 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006488
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6490 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006491 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006492 if (v == NULL)
6493 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006494 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006495 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006496 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006497 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006498 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006499 Py_DECREF(v);
6500 return NULL;
6501 }
6502 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006503
6504 onError:
6505 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006506}
6507
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006508PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509"S.expandtabs([tabsize]) -> unicode\n\
6510\n\
6511Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006512If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513
6514static PyObject*
6515unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6516{
6517 Py_UNICODE *e;
6518 Py_UNICODE *p;
6519 Py_UNICODE *q;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006520 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521 PyUnicodeObject *u;
6522 int tabsize = 8;
6523
6524 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6525 return NULL;
6526
Thomas Wouters7e474022000-07-16 12:04:32 +00006527 /* First pass: determine size of output string */
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006528 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529 e = self->str + self->length;
6530 for (p = self->str; p < e; p++)
6531 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006532 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533 j += tabsize - (j % tabsize);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006534 if (old_j > j) {
6535 PyErr_SetString(PyExc_OverflowError,
6536 "new string is too long");
6537 return NULL;
6538 }
6539 old_j = j;
6540 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541 }
6542 else {
6543 j++;
6544 if (*p == '\n' || *p == '\r') {
6545 i += j;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006546 old_j = j = 0;
6547 if (i < 0) {
6548 PyErr_SetString(PyExc_OverflowError,
6549 "new string is too long");
6550 return NULL;
6551 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552 }
6553 }
6554
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006555 if ((i + j) < 0) {
6556 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6557 return NULL;
6558 }
6559
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560 /* Second pass: create output string and fill it */
6561 u = _PyUnicode_New(i + j);
6562 if (!u)
6563 return NULL;
6564
6565 j = 0;
6566 q = u->str;
6567
6568 for (p = self->str; p < e; p++)
6569 if (*p == '\t') {
6570 if (tabsize > 0) {
6571 i = tabsize - (j % tabsize);
6572 j += i;
6573 while (i--)
6574 *q++ = ' ';
6575 }
6576 }
6577 else {
6578 j++;
6579 *q++ = *p;
6580 if (*p == '\n' || *p == '\r')
6581 j = 0;
6582 }
6583
6584 return (PyObject*) u;
6585}
6586
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006587PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588"S.find(sub [,start [,end]]) -> int\n\
6589\n\
6590Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006591such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592arguments start and end are interpreted as in slice notation.\n\
6593\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006594Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595
6596static PyObject *
6597unicode_find(PyUnicodeObject *self, PyObject *args)
6598{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006599 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006600 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006601 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006602 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603
Guido van Rossumb8872e62000-05-09 14:14:27 +00006604 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6605 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006607 substring = PyUnicode_FromObject(substring);
6608 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609 return NULL;
6610
Thomas Wouters477c8d52006-05-27 19:21:47 +00006611 result = stringlib_find_slice(
6612 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6613 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6614 start, end
6615 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616
6617 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006618
6619 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620}
6621
6622static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006623unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624{
6625 if (index < 0 || index >= self->length) {
6626 PyErr_SetString(PyExc_IndexError, "string index out of range");
6627 return NULL;
6628 }
6629
6630 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6631}
6632
Guido van Rossumc2504932007-09-18 19:42:40 +00006633/* Believe it or not, this produces the same value for ASCII strings
6634 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00006636unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637{
Guido van Rossumc2504932007-09-18 19:42:40 +00006638 Py_ssize_t len;
6639 Py_UNICODE *p;
6640 long x;
6641
6642 if (self->hash != -1)
6643 return self->hash;
6644 len = Py_Size(self);
6645 p = self->str;
6646 x = *p << 7;
6647 while (--len >= 0)
6648 x = (1000003*x) ^ *p++;
6649 x ^= Py_Size(self);
6650 if (x == -1)
6651 x = -2;
6652 self->hash = x;
6653 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654}
6655
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006656PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657"S.index(sub [,start [,end]]) -> int\n\
6658\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006659Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660
6661static PyObject *
6662unicode_index(PyUnicodeObject *self, PyObject *args)
6663{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006664 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006665 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006666 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006667 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668
Guido van Rossumb8872e62000-05-09 14:14:27 +00006669 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6670 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006672 substring = PyUnicode_FromObject(substring);
6673 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674 return NULL;
6675
Thomas Wouters477c8d52006-05-27 19:21:47 +00006676 result = stringlib_find_slice(
6677 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6678 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6679 start, end
6680 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681
6682 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006683
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684 if (result < 0) {
6685 PyErr_SetString(PyExc_ValueError, "substring not found");
6686 return NULL;
6687 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006688
Martin v. Löwis18e16552006-02-15 17:27:45 +00006689 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690}
6691
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006692PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006693"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006695Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006696at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697
6698static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006699unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006700{
6701 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6702 register const Py_UNICODE *e;
6703 int cased;
6704
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705 /* Shortcut for single character strings */
6706 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006707 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006709 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006710 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006711 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006712
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713 e = p + PyUnicode_GET_SIZE(self);
6714 cased = 0;
6715 for (; p < e; p++) {
6716 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006717
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006719 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720 else if (!cased && Py_UNICODE_ISLOWER(ch))
6721 cased = 1;
6722 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006723 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724}
6725
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006726PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006727"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006729Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006730at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731
6732static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006733unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734{
6735 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6736 register const Py_UNICODE *e;
6737 int cased;
6738
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739 /* Shortcut for single character strings */
6740 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006741 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006743 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006744 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006745 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006746
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747 e = p + PyUnicode_GET_SIZE(self);
6748 cased = 0;
6749 for (; p < e; p++) {
6750 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006751
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006753 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754 else if (!cased && Py_UNICODE_ISUPPER(ch))
6755 cased = 1;
6756 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006757 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758}
6759
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006760PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006761"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006763Return True if S is a titlecased string and there is at least one\n\
6764character in S, i.e. upper- and titlecase characters may only\n\
6765follow uncased characters and lowercase characters only cased ones.\n\
6766Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767
6768static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006769unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770{
6771 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6772 register const Py_UNICODE *e;
6773 int cased, previous_is_cased;
6774
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775 /* Shortcut for single character strings */
6776 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006777 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6778 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006780 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006781 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006782 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006783
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784 e = p + PyUnicode_GET_SIZE(self);
6785 cased = 0;
6786 previous_is_cased = 0;
6787 for (; p < e; p++) {
6788 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006789
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6791 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006792 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793 previous_is_cased = 1;
6794 cased = 1;
6795 }
6796 else if (Py_UNICODE_ISLOWER(ch)) {
6797 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006798 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799 previous_is_cased = 1;
6800 cased = 1;
6801 }
6802 else
6803 previous_is_cased = 0;
6804 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006805 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806}
6807
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006808PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006809"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006811Return True if all characters in S are whitespace\n\
6812and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813
6814static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006815unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006816{
6817 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6818 register const Py_UNICODE *e;
6819
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820 /* Shortcut for single character strings */
6821 if (PyUnicode_GET_SIZE(self) == 1 &&
6822 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006823 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006825 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006826 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006827 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006828
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829 e = p + PyUnicode_GET_SIZE(self);
6830 for (; p < e; p++) {
6831 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006832 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006834 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835}
6836
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006837PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006838"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006839\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006840Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006841and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006842
6843static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006844unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006845{
6846 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6847 register const Py_UNICODE *e;
6848
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006849 /* Shortcut for single character strings */
6850 if (PyUnicode_GET_SIZE(self) == 1 &&
6851 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006852 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006853
6854 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006855 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006856 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006857
6858 e = p + PyUnicode_GET_SIZE(self);
6859 for (; p < e; p++) {
6860 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006861 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006862 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006863 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006864}
6865
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006866PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006867"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006868\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006869Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006870and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006871
6872static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006873unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006874{
6875 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6876 register const Py_UNICODE *e;
6877
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006878 /* Shortcut for single character strings */
6879 if (PyUnicode_GET_SIZE(self) == 1 &&
6880 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006881 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006882
6883 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006884 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006885 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006886
6887 e = p + PyUnicode_GET_SIZE(self);
6888 for (; p < e; p++) {
6889 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006890 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006891 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006892 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006893}
6894
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006895PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006896"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006898Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006899False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900
6901static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006902unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903{
6904 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6905 register const Py_UNICODE *e;
6906
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907 /* Shortcut for single character strings */
6908 if (PyUnicode_GET_SIZE(self) == 1 &&
6909 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006910 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006912 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006913 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006914 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006915
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916 e = p + PyUnicode_GET_SIZE(self);
6917 for (; p < e; p++) {
6918 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006919 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006921 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922}
6923
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006924PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006925"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006927Return True if all characters in S are digits\n\
6928and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929
6930static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006931unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006932{
6933 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6934 register const Py_UNICODE *e;
6935
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936 /* Shortcut for single character strings */
6937 if (PyUnicode_GET_SIZE(self) == 1 &&
6938 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006939 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006941 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006942 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006943 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006944
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945 e = p + PyUnicode_GET_SIZE(self);
6946 for (; p < e; p++) {
6947 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006948 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006950 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951}
6952
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006953PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006954"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006955\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006956Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006957False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958
6959static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006960unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961{
6962 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6963 register const Py_UNICODE *e;
6964
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965 /* Shortcut for single character strings */
6966 if (PyUnicode_GET_SIZE(self) == 1 &&
6967 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006968 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006969
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006970 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006971 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006972 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006973
Guido van Rossumd57fd912000-03-10 22:53:23 +00006974 e = p + PyUnicode_GET_SIZE(self);
6975 for (; p < e; p++) {
6976 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006977 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006979 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980}
6981
Martin v. Löwis47383402007-08-15 07:32:56 +00006982int
6983PyUnicode_IsIdentifier(PyObject *self)
6984{
6985 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
6986 register const Py_UNICODE *e;
6987
6988 /* Special case for empty strings */
6989 if (PyUnicode_GET_SIZE(self) == 0)
6990 return 0;
6991
6992 /* PEP 3131 says that the first character must be in
6993 XID_Start and subsequent characters in XID_Continue,
6994 and for the ASCII range, the 2.x rules apply (i.e
6995 start with letters and underscore, continue with
6996 letters, digits, underscore). However, given the current
6997 definition of XID_Start and XID_Continue, it is sufficient
6998 to check just for these, except that _ must be allowed
6999 as starting an identifier. */
7000 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7001 return 0;
7002
7003 e = p + PyUnicode_GET_SIZE(self);
7004 for (p++; p < e; p++) {
7005 if (!_PyUnicode_IsXidContinue(*p))
7006 return 0;
7007 }
7008 return 1;
7009}
7010
7011PyDoc_STRVAR(isidentifier__doc__,
7012"S.isidentifier() -> bool\n\
7013\n\
7014Return True if S is a valid identifier according\n\
7015to the language definition.");
7016
7017static PyObject*
7018unicode_isidentifier(PyObject *self)
7019{
7020 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7021}
7022
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007023PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024"S.join(sequence) -> unicode\n\
7025\n\
7026Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007027sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028
7029static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007030unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007032 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033}
7034
Martin v. Löwis18e16552006-02-15 17:27:45 +00007035static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036unicode_length(PyUnicodeObject *self)
7037{
7038 return self->length;
7039}
7040
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007041PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00007042"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043\n\
7044Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007045done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046
7047static PyObject *
7048unicode_ljust(PyUnicodeObject *self, PyObject *args)
7049{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007050 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007051 Py_UNICODE fillchar = ' ';
7052
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007053 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054 return NULL;
7055
Tim Peters7a29bd52001-09-12 03:03:31 +00007056 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007057 Py_INCREF(self);
7058 return (PyObject*) self;
7059 }
7060
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007061 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062}
7063
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007064PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007065"S.lower() -> unicode\n\
7066\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007067Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007068
7069static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007070unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007071{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072 return fixup(self, fixlower);
7073}
7074
/* Selects which end(s) of the string a strip operation works on.  The
   values double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the strip type constants above */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for strip type i: its format string with the leading
   three characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7083
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007084/* externally visible for str.strip(unicode) */
7085PyObject *
7086_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7087{
7088 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007089 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007090 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007091 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7092 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007093
Thomas Wouters477c8d52006-05-27 19:21:47 +00007094 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7095
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007096 i = 0;
7097 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007098 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7099 i++;
7100 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007101 }
7102
7103 j = len;
7104 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007105 do {
7106 j--;
7107 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7108 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007109 }
7110
7111 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007112 Py_INCREF(self);
7113 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007114 }
7115 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007116 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007117}
7118
Guido van Rossumd57fd912000-03-10 22:53:23 +00007119
7120static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007121do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007122{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007123 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007124 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007125
7126 i = 0;
7127 if (striptype != RIGHTSTRIP) {
7128 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7129 i++;
7130 }
7131 }
7132
7133 j = len;
7134 if (striptype != LEFTSTRIP) {
7135 do {
7136 j--;
7137 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7138 j++;
7139 }
7140
7141 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7142 Py_INCREF(self);
7143 return (PyObject*)self;
7144 }
7145 else
7146 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007147}
7148
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007149
7150static PyObject *
7151do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7152{
7153 PyObject *sep = NULL;
7154
7155 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7156 return NULL;
7157
7158 if (sep != NULL && sep != Py_None) {
7159 if (PyUnicode_Check(sep))
7160 return _PyUnicode_XStrip(self, striptype, sep);
7161 else if (PyString_Check(sep)) {
7162 PyObject *res;
7163 sep = PyUnicode_FromObject(sep);
7164 if (sep==NULL)
7165 return NULL;
7166 res = _PyUnicode_XStrip(self, striptype, sep);
7167 Py_DECREF(sep);
7168 return res;
7169 }
7170 else {
7171 PyErr_Format(PyExc_TypeError,
7172 "%s arg must be None, unicode or str",
7173 STRIPNAME(striptype));
7174 return NULL;
7175 }
7176 }
7177
7178 return do_strip(self, striptype);
7179}
7180
7181
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007182PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007183"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007184\n\
7185Return a copy of the string S with leading and trailing\n\
7186whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007187If chars is given and not None, remove characters in chars instead.\n\
7188If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007189
7190static PyObject *
7191unicode_strip(PyUnicodeObject *self, PyObject *args)
7192{
7193 if (PyTuple_GET_SIZE(args) == 0)
7194 return do_strip(self, BOTHSTRIP); /* Common case */
7195 else
7196 return do_argstrip(self, BOTHSTRIP, args);
7197}
7198
7199
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007200PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007201"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007202\n\
7203Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007204If chars is given and not None, remove characters in chars instead.\n\
7205If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007206
7207static PyObject *
7208unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7209{
7210 if (PyTuple_GET_SIZE(args) == 0)
7211 return do_strip(self, LEFTSTRIP); /* Common case */
7212 else
7213 return do_argstrip(self, LEFTSTRIP, args);
7214}
7215
7216
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007217PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007218"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007219\n\
7220Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007221If chars is given and not None, remove characters in chars instead.\n\
7222If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007223
7224static PyObject *
7225unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7226{
7227 if (PyTuple_GET_SIZE(args) == 0)
7228 return do_strip(self, RIGHTSTRIP); /* Common case */
7229 else
7230 return do_argstrip(self, RIGHTSTRIP, args);
7231}
7232
7233
Guido van Rossumd57fd912000-03-10 22:53:23 +00007234static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007235unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007236{
7237 PyUnicodeObject *u;
7238 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007239 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007240 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007241
7242 if (len < 0)
7243 len = 0;
7244
Tim Peters7a29bd52001-09-12 03:03:31 +00007245 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246 /* no repeat, return original string */
7247 Py_INCREF(str);
7248 return (PyObject*) str;
7249 }
Tim Peters8f422462000-09-09 06:13:41 +00007250
7251 /* ensure # of chars needed doesn't overflow int and # of bytes
7252 * needed doesn't overflow size_t
7253 */
7254 nchars = len * str->length;
7255 if (len && nchars / len != str->length) {
7256 PyErr_SetString(PyExc_OverflowError,
7257 "repeated string is too long");
7258 return NULL;
7259 }
7260 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7261 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7262 PyErr_SetString(PyExc_OverflowError,
7263 "repeated string is too long");
7264 return NULL;
7265 }
7266 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007267 if (!u)
7268 return NULL;
7269
7270 p = u->str;
7271
Thomas Wouters477c8d52006-05-27 19:21:47 +00007272 if (str->length == 1 && len > 0) {
7273 Py_UNICODE_FILL(p, str->str[0], len);
7274 } else {
7275 Py_ssize_t done = 0; /* number of characters copied this far */
7276 if (done < nchars) {
7277 Py_UNICODE_COPY(p, str->str, str->length);
7278 done = str->length;
7279 }
7280 while (done < nchars) {
7281 int n = (done <= nchars-done) ? done : nchars-done;
7282 Py_UNICODE_COPY(p+done, p, n);
7283 done += n;
7284 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007285 }
7286
7287 return (PyObject*) u;
7288}
7289
7290PyObject *PyUnicode_Replace(PyObject *obj,
7291 PyObject *subobj,
7292 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007293 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294{
7295 PyObject *self;
7296 PyObject *str1;
7297 PyObject *str2;
7298 PyObject *result;
7299
7300 self = PyUnicode_FromObject(obj);
7301 if (self == NULL)
7302 return NULL;
7303 str1 = PyUnicode_FromObject(subobj);
7304 if (str1 == NULL) {
7305 Py_DECREF(self);
7306 return NULL;
7307 }
7308 str2 = PyUnicode_FromObject(replobj);
7309 if (str2 == NULL) {
7310 Py_DECREF(self);
7311 Py_DECREF(str1);
7312 return NULL;
7313 }
Tim Petersced69f82003-09-16 20:30:58 +00007314 result = replace((PyUnicodeObject *)self,
7315 (PyUnicodeObject *)str1,
7316 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317 maxcount);
7318 Py_DECREF(self);
7319 Py_DECREF(str1);
7320 Py_DECREF(str2);
7321 return result;
7322}
7323
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007324PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007325"S.replace (old, new[, maxsplit]) -> unicode\n\
7326\n\
7327Return a copy of S with all occurrences of substring\n\
7328old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007329given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330
7331static PyObject*
7332unicode_replace(PyUnicodeObject *self, PyObject *args)
7333{
7334 PyUnicodeObject *str1;
7335 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007336 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007337 PyObject *result;
7338
Martin v. Löwis18e16552006-02-15 17:27:45 +00007339 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340 return NULL;
7341 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7342 if (str1 == NULL)
7343 return NULL;
7344 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007345 if (str2 == NULL) {
7346 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007347 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007348 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349
7350 result = replace(self, str1, str2, maxcount);
7351
7352 Py_DECREF(str1);
7353 Py_DECREF(str2);
7354 return result;
7355}
7356
7357static
7358PyObject *unicode_repr(PyObject *unicode)
7359{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007360 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007361 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007362 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7363 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7364
7365 /* XXX(nnorwitz): rather than over-allocating, it would be
7366 better to choose a different scheme. Perhaps scan the
7367 first N-chars of the string and allocate based on that size.
7368 */
7369 /* Initial allocation is based on the longest-possible unichr
7370 escape.
7371
7372 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7373 unichr, so in this case it's the longest unichr escape. In
7374 narrow (UTF-16) builds this is five chars per source unichr
7375 since there are two unichrs in the surrogate pair, so in narrow
7376 (UTF-16) builds it's not the longest unichr escape.
7377
7378 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7379 so in the narrow (UTF-16) build case it's the longest unichr
7380 escape.
7381 */
7382
Walter Dörwald1ab83302007-05-18 17:15:44 +00007383 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007384 2 /* quotes */
7385#ifdef Py_UNICODE_WIDE
7386 + 10*size
7387#else
7388 + 6*size
7389#endif
7390 + 1);
7391 if (repr == NULL)
7392 return NULL;
7393
Walter Dörwald1ab83302007-05-18 17:15:44 +00007394 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007395
7396 /* Add quote */
7397 *p++ = (findchar(s, size, '\'') &&
7398 !findchar(s, size, '"')) ? '"' : '\'';
7399 while (size-- > 0) {
7400 Py_UNICODE ch = *s++;
7401
7402 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007403 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007404 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007405 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007406 continue;
7407 }
7408
7409#ifdef Py_UNICODE_WIDE
7410 /* Map 21-bit characters to '\U00xxxxxx' */
7411 else if (ch >= 0x10000) {
7412 *p++ = '\\';
7413 *p++ = 'U';
7414 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7415 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7416 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7417 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7418 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7419 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7420 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7421 *p++ = hexdigits[ch & 0x0000000F];
7422 continue;
7423 }
7424#else
7425 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7426 else if (ch >= 0xD800 && ch < 0xDC00) {
7427 Py_UNICODE ch2;
7428 Py_UCS4 ucs;
7429
7430 ch2 = *s++;
7431 size--;
7432 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7433 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7434 *p++ = '\\';
7435 *p++ = 'U';
7436 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7437 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7438 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7439 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7440 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7441 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7442 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7443 *p++ = hexdigits[ucs & 0x0000000F];
7444 continue;
7445 }
7446 /* Fall through: isolated surrogates are copied as-is */
7447 s--;
7448 size++;
7449 }
7450#endif
7451
7452 /* Map 16-bit characters to '\uxxxx' */
7453 if (ch >= 256) {
7454 *p++ = '\\';
7455 *p++ = 'u';
7456 *p++ = hexdigits[(ch >> 12) & 0x000F];
7457 *p++ = hexdigits[(ch >> 8) & 0x000F];
7458 *p++ = hexdigits[(ch >> 4) & 0x000F];
7459 *p++ = hexdigits[ch & 0x000F];
7460 }
7461
7462 /* Map special whitespace to '\t', \n', '\r' */
7463 else if (ch == '\t') {
7464 *p++ = '\\';
7465 *p++ = 't';
7466 }
7467 else if (ch == '\n') {
7468 *p++ = '\\';
7469 *p++ = 'n';
7470 }
7471 else if (ch == '\r') {
7472 *p++ = '\\';
7473 *p++ = 'r';
7474 }
7475
7476 /* Map non-printable US ASCII to '\xhh' */
7477 else if (ch < ' ' || ch >= 0x7F) {
7478 *p++ = '\\';
7479 *p++ = 'x';
7480 *p++ = hexdigits[(ch >> 4) & 0x000F];
7481 *p++ = hexdigits[ch & 0x000F];
7482 }
7483
7484 /* Copy everything else as-is */
7485 else
7486 *p++ = (char) ch;
7487 }
7488 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007489 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007490
7491 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007492 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007493 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007494}
7495
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007496PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497"S.rfind(sub [,start [,end]]) -> int\n\
7498\n\
7499Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007500such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007501arguments start and end are interpreted as in slice notation.\n\
7502\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007503Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504
7505static PyObject *
7506unicode_rfind(PyUnicodeObject *self, PyObject *args)
7507{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007508 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007509 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007510 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007511 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007512
Guido van Rossumb8872e62000-05-09 14:14:27 +00007513 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7514 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007516 substring = PyUnicode_FromObject(substring);
7517 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007518 return NULL;
7519
Thomas Wouters477c8d52006-05-27 19:21:47 +00007520 result = stringlib_rfind_slice(
7521 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7522 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7523 start, end
7524 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007525
7526 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007527
7528 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529}
7530
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007531PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532"S.rindex(sub [,start [,end]]) -> int\n\
7533\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007534Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007535
7536static PyObject *
7537unicode_rindex(PyUnicodeObject *self, PyObject *args)
7538{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007539 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007540 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007541 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007542 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007543
Guido van Rossumb8872e62000-05-09 14:14:27 +00007544 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7545 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007547 substring = PyUnicode_FromObject(substring);
7548 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549 return NULL;
7550
Thomas Wouters477c8d52006-05-27 19:21:47 +00007551 result = stringlib_rfind_slice(
7552 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7553 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7554 start, end
7555 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556
7557 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007558
Guido van Rossumd57fd912000-03-10 22:53:23 +00007559 if (result < 0) {
7560 PyErr_SetString(PyExc_ValueError, "substring not found");
7561 return NULL;
7562 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007563 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564}
7565
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007566PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007567"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568\n\
7569Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007570done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571
7572static PyObject *
7573unicode_rjust(PyUnicodeObject *self, PyObject *args)
7574{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007575 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007576 Py_UNICODE fillchar = ' ';
7577
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007578 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579 return NULL;
7580
Tim Peters7a29bd52001-09-12 03:03:31 +00007581 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582 Py_INCREF(self);
7583 return (PyObject*) self;
7584 }
7585
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007586 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587}
7588
Guido van Rossumd57fd912000-03-10 22:53:23 +00007589PyObject *PyUnicode_Split(PyObject *s,
7590 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007591 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592{
7593 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007594
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595 s = PyUnicode_FromObject(s);
7596 if (s == NULL)
7597 return NULL;
7598 if (sep != NULL) {
7599 sep = PyUnicode_FromObject(sep);
7600 if (sep == NULL) {
7601 Py_DECREF(s);
7602 return NULL;
7603 }
7604 }
7605
7606 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7607
7608 Py_DECREF(s);
7609 Py_XDECREF(sep);
7610 return result;
7611}
7612
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007613PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614"S.split([sep [,maxsplit]]) -> list of strings\n\
7615\n\
7616Return a list of the words in S, using sep as the\n\
7617delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007618splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007619any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007620
7621static PyObject*
7622unicode_split(PyUnicodeObject *self, PyObject *args)
7623{
7624 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007625 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007626
Martin v. Löwis18e16552006-02-15 17:27:45 +00007627 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007628 return NULL;
7629
7630 if (substring == Py_None)
7631 return split(self, NULL, maxcount);
7632 else if (PyUnicode_Check(substring))
7633 return split(self, (PyUnicodeObject *)substring, maxcount);
7634 else
7635 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7636}
7637
Thomas Wouters477c8d52006-05-27 19:21:47 +00007638PyObject *
7639PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7640{
7641 PyObject* str_obj;
7642 PyObject* sep_obj;
7643 PyObject* out;
7644
7645 str_obj = PyUnicode_FromObject(str_in);
7646 if (!str_obj)
7647 return NULL;
7648 sep_obj = PyUnicode_FromObject(sep_in);
7649 if (!sep_obj) {
7650 Py_DECREF(str_obj);
7651 return NULL;
7652 }
7653
7654 out = stringlib_partition(
7655 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7656 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7657 );
7658
7659 Py_DECREF(sep_obj);
7660 Py_DECREF(str_obj);
7661
7662 return out;
7663}
7664
7665
7666PyObject *
7667PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7668{
7669 PyObject* str_obj;
7670 PyObject* sep_obj;
7671 PyObject* out;
7672
7673 str_obj = PyUnicode_FromObject(str_in);
7674 if (!str_obj)
7675 return NULL;
7676 sep_obj = PyUnicode_FromObject(sep_in);
7677 if (!sep_obj) {
7678 Py_DECREF(str_obj);
7679 return NULL;
7680 }
7681
7682 out = stringlib_rpartition(
7683 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7684 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7685 );
7686
7687 Py_DECREF(sep_obj);
7688 Py_DECREF(str_obj);
7689
7690 return out;
7691}
7692
7693PyDoc_STRVAR(partition__doc__,
7694"S.partition(sep) -> (head, sep, tail)\n\
7695\n\
7696Searches for the separator sep in S, and returns the part before it,\n\
7697the separator itself, and the part after it. If the separator is not\n\
7698found, returns S and two empty strings.");
7699
7700static PyObject*
7701unicode_partition(PyUnicodeObject *self, PyObject *separator)
7702{
7703 return PyUnicode_Partition((PyObject *)self, separator);
7704}
7705
7706PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007707"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007708\n\
7709Searches for the separator sep in S, starting at the end of S, and returns\n\
7710the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007711separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007712
7713static PyObject*
7714unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7715{
7716 return PyUnicode_RPartition((PyObject *)self, separator);
7717}
7718
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007719PyObject *PyUnicode_RSplit(PyObject *s,
7720 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007721 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007722{
7723 PyObject *result;
7724
7725 s = PyUnicode_FromObject(s);
7726 if (s == NULL)
7727 return NULL;
7728 if (sep != NULL) {
7729 sep = PyUnicode_FromObject(sep);
7730 if (sep == NULL) {
7731 Py_DECREF(s);
7732 return NULL;
7733 }
7734 }
7735
7736 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7737
7738 Py_DECREF(s);
7739 Py_XDECREF(sep);
7740 return result;
7741}
7742
7743PyDoc_STRVAR(rsplit__doc__,
7744"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7745\n\
7746Return a list of the words in S, using sep as the\n\
7747delimiter string, starting at the end of the string and\n\
7748working to the front. If maxsplit is given, at most maxsplit\n\
7749splits are done. If sep is not specified, any whitespace string\n\
7750is a separator.");
7751
7752static PyObject*
7753unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7754{
7755 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007756 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007757
Martin v. Löwis18e16552006-02-15 17:27:45 +00007758 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007759 return NULL;
7760
7761 if (substring == Py_None)
7762 return rsplit(self, NULL, maxcount);
7763 else if (PyUnicode_Check(substring))
7764 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7765 else
7766 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7767}
7768
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007769PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007770"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771\n\
7772Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007773Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007774is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007775
7776static PyObject*
7777unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7778{
Guido van Rossum86662912000-04-11 15:38:46 +00007779 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007780
Guido van Rossum86662912000-04-11 15:38:46 +00007781 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007782 return NULL;
7783
Guido van Rossum86662912000-04-11 15:38:46 +00007784 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007785}
7786
7787static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007788PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789{
Walter Dörwald346737f2007-05-31 10:44:43 +00007790 if (PyUnicode_CheckExact(self)) {
7791 Py_INCREF(self);
7792 return self;
7793 } else
7794 /* Subtype -- return genuine unicode string with the same value. */
7795 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7796 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007797}
7798
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007799PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007800"S.swapcase() -> unicode\n\
7801\n\
7802Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007803and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804
7805static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007806unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007807{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007808 return fixup(self, fixswapcase);
7809}
7810
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007811PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007812"S.translate(table) -> unicode\n\
7813\n\
7814Return a copy of the string S, where all characters have been mapped\n\
7815through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007816Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7817Unmapped characters are left untouched. Characters mapped to None\n\
7818are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007819
7820static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007821unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007822{
Tim Petersced69f82003-09-16 20:30:58 +00007823 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007824 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007825 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007826 "ignore");
7827}
7828
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007829PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830"S.upper() -> unicode\n\
7831\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007832Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007833
7834static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007835unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007836{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007837 return fixup(self, fixupper);
7838}
7839
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007840PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007841"S.zfill(width) -> unicode\n\
7842\n\
7843Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007844of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007845
7846static PyObject *
7847unicode_zfill(PyUnicodeObject *self, PyObject *args)
7848{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007849 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007850 PyUnicodeObject *u;
7851
Martin v. Löwis18e16552006-02-15 17:27:45 +00007852 Py_ssize_t width;
7853 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007854 return NULL;
7855
7856 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007857 if (PyUnicode_CheckExact(self)) {
7858 Py_INCREF(self);
7859 return (PyObject*) self;
7860 }
7861 else
7862 return PyUnicode_FromUnicode(
7863 PyUnicode_AS_UNICODE(self),
7864 PyUnicode_GET_SIZE(self)
7865 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866 }
7867
7868 fill = width - self->length;
7869
7870 u = pad(self, fill, 0, '0');
7871
Walter Dörwald068325e2002-04-15 13:36:47 +00007872 if (u == NULL)
7873 return NULL;
7874
Guido van Rossumd57fd912000-03-10 22:53:23 +00007875 if (u->str[fill] == '+' || u->str[fill] == '-') {
7876 /* move sign to beginning of string */
7877 u->str[0] = u->str[fill];
7878 u->str[fill] = '0';
7879 }
7880
7881 return (PyObject*) u;
7882}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007883
#if 0
/* Debugging helper, currently compiled out: reports the value of
   unicode_freelist_size as a Python integer. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
7891
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007892PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007893"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007894\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007895Return True if S starts with the specified prefix, False otherwise.\n\
7896With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007897With optional end, stop comparing S at that position.\n\
7898prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007899
7900static PyObject *
7901unicode_startswith(PyUnicodeObject *self,
7902 PyObject *args)
7903{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007904 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007905 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007906 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007907 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007908 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007909
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007910 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007911 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007912 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007913 if (PyTuple_Check(subobj)) {
7914 Py_ssize_t i;
7915 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7916 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7917 PyTuple_GET_ITEM(subobj, i));
7918 if (substring == NULL)
7919 return NULL;
7920 result = tailmatch(self, substring, start, end, -1);
7921 Py_DECREF(substring);
7922 if (result) {
7923 Py_RETURN_TRUE;
7924 }
7925 }
7926 /* nothing matched */
7927 Py_RETURN_FALSE;
7928 }
7929 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007930 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007931 return NULL;
7932 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007933 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007934 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007935}
7936
7937
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007938PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007939"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007940\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007941Return True if S ends with the specified suffix, False otherwise.\n\
7942With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007943With optional end, stop comparing S at that position.\n\
7944suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007945
7946static PyObject *
7947unicode_endswith(PyUnicodeObject *self,
7948 PyObject *args)
7949{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007950 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007951 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007952 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007953 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007954 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007955
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007956 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7957 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007958 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007959 if (PyTuple_Check(subobj)) {
7960 Py_ssize_t i;
7961 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7962 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7963 PyTuple_GET_ITEM(subobj, i));
7964 if (substring == NULL)
7965 return NULL;
7966 result = tailmatch(self, substring, start, end, +1);
7967 Py_DECREF(substring);
7968 if (result) {
7969 Py_RETURN_TRUE;
7970 }
7971 }
7972 Py_RETURN_FALSE;
7973 }
7974 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007976 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007977
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007978 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007979 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007980 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981}
7982
Eric Smith8c663262007-08-25 02:26:07 +00007983#include "stringlib/string_format.h"
7984
7985PyDoc_STRVAR(format__doc__,
7986"S.format(*args, **kwargs) -> unicode\n\
7987\n\
7988");
7989
Eric Smith8c663262007-08-25 02:26:07 +00007990PyDoc_STRVAR(p_format__doc__,
7991"S.__format__(format_spec) -> unicode\n\
7992\n\
7993");
7994
7995static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007996unicode_getnewargs(PyUnicodeObject *v)
7997{
7998 return Py_BuildValue("(u#)", v->str, v->length);
7999}
8000
8001
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002static PyMethodDef unicode_methods[] = {
8003
8004 /* Order is according to common usage: often used methods should
8005 appear first, since lookup is done sequentially. */
8006
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008007 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8008 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8009 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008010 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008011 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8012 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8013 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8014 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8015 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8016 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8017 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008018 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008019 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8020 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8021 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008022 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008023 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8024 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8025 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008026 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008027 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008028 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008029 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008030 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8031 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8032 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8033 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8034 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8035 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8036 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8037 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8038 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8039 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8040 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8041 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8042 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8043 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008044 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008045 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008046 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8047 {"__format__", (PyCFunction) unicode_unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008048 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8049 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Walter Dörwald068325e2002-04-15 13:36:47 +00008050#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008051 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052#endif
8053
8054#if 0
8055 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008056 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057#endif
8058
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008059 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008060 {NULL, NULL}
8061};
8062
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008063static PyObject *
8064unicode_mod(PyObject *v, PyObject *w)
8065{
8066 if (!PyUnicode_Check(v)) {
8067 Py_INCREF(Py_NotImplemented);
8068 return Py_NotImplemented;
8069 }
8070 return PyUnicode_Format(v, w);
8071}
8072
8073static PyNumberMethods unicode_as_number = {
8074 0, /*nb_add*/
8075 0, /*nb_subtract*/
8076 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008077 unicode_mod, /*nb_remainder*/
8078};
8079
Guido van Rossumd57fd912000-03-10 22:53:23 +00008080static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008081 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008082 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008083 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8084 (ssizeargfunc) unicode_getitem, /* sq_item */
Thomas Woutersd2cf20e2007-08-30 22:57:53 +00008085 0, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008086 0, /* sq_ass_item */
8087 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008088 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008089};
8090
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008091static PyObject*
8092unicode_subscript(PyUnicodeObject* self, PyObject* item)
8093{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008094 if (PyIndex_Check(item)) {
8095 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008096 if (i == -1 && PyErr_Occurred())
8097 return NULL;
8098 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008099 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008100 return unicode_getitem(self, i);
8101 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008102 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008103 Py_UNICODE* source_buf;
8104 Py_UNICODE* result_buf;
8105 PyObject* result;
8106
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008107 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008108 &start, &stop, &step, &slicelength) < 0) {
8109 return NULL;
8110 }
8111
8112 if (slicelength <= 0) {
8113 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008114 } else if (start == 0 && step == 1 && slicelength == self->length &&
8115 PyUnicode_CheckExact(self)) {
8116 Py_INCREF(self);
8117 return (PyObject *)self;
8118 } else if (step == 1) {
8119 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008120 } else {
8121 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008122 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
8123 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008124
8125 if (result_buf == NULL)
8126 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008127
8128 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8129 result_buf[i] = source_buf[cur];
8130 }
Tim Petersced69f82003-09-16 20:30:58 +00008131
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008132 result = PyUnicode_FromUnicode(result_buf, slicelength);
8133 PyMem_FREE(result_buf);
8134 return result;
8135 }
8136 } else {
8137 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8138 return NULL;
8139 }
8140}
8141
8142static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008143 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008144 (binaryfunc)unicode_subscript, /* mp_subscript */
8145 (objobjargproc)0, /* mp_ass_subscript */
8146};
8147
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148
Guido van Rossumd57fd912000-03-10 22:53:23 +00008149/* Helpers for PyUnicode_Format() */
8150
8151static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008152getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008154 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008155 if (argidx < arglen) {
8156 (*p_argidx)++;
8157 if (arglen < 0)
8158 return args;
8159 else
8160 return PyTuple_GetItem(args, argidx);
8161 }
8162 PyErr_SetString(PyExc_TypeError,
8163 "not enough arguments for format string");
8164 return NULL;
8165}
8166
/* Bit flags describing a %-format conversion.  F_ALT selects the '#'
   alternate form in formatfloat()/formatint().
   NOTE(review): the remaining flags presumably mirror the '-', '+',
   ' ' and '0' printf flags -- confirm against PyUnicode_Format(). */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
8172
Martin v. Löwis18e16552006-02-15 17:27:45 +00008173static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008174strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008175{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008176 register Py_ssize_t i;
8177 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008178 for (i = len - 1; i >= 0; i--)
8179 buffer[i] = (Py_UNICODE) charbuffer[i];
8180
Guido van Rossumd57fd912000-03-10 22:53:23 +00008181 return len;
8182}
8183
Neal Norwitzfc76d632006-01-10 06:03:13 +00008184static int
8185doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8186{
Tim Peters15231542006-02-16 01:08:01 +00008187 Py_ssize_t result;
8188
Neal Norwitzfc76d632006-01-10 06:03:13 +00008189 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008190 result = strtounicode(buffer, (char *)buffer);
8191 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008192}
8193
8194static int
8195longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8196{
Tim Peters15231542006-02-16 01:08:01 +00008197 Py_ssize_t result;
8198
Neal Norwitzfc76d632006-01-10 06:03:13 +00008199 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008200 result = strtounicode(buffer, (char *)buffer);
8201 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008202}
8203
Guido van Rossum078151d2002-08-11 04:24:12 +00008204/* XXX To save some code duplication, formatfloat/long/int could have been
8205 shared with stringobject.c, converting from 8-bit to Unicode after the
8206 formatting is done. */
8207
Guido van Rossumd57fd912000-03-10 22:53:23 +00008208static int
8209formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008210 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008211 int flags,
8212 int prec,
8213 int type,
8214 PyObject *v)
8215{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008216 /* fmt = '%#.' + `prec` + `type`
8217 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008218 char fmt[20];
8219 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008220
Guido van Rossumd57fd912000-03-10 22:53:23 +00008221 x = PyFloat_AsDouble(v);
8222 if (x == -1.0 && PyErr_Occurred())
8223 return -1;
8224 if (prec < 0)
8225 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008226 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8227 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008228 /* Worst case length calc to ensure no buffer overrun:
8229
8230 'g' formats:
8231 fmt = %#.<prec>g
8232 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8233 for any double rep.)
8234 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8235
8236 'f' formats:
8237 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8238 len = 1 + 50 + 1 + prec = 52 + prec
8239
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008240 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008241 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008242
8243 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008244 if (((type == 'g' || type == 'G') &&
8245 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008246 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008247 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008248 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008249 return -1;
8250 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008251 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8252 (flags&F_ALT) ? "#" : "",
8253 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008254 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008255}
8256
Tim Peters38fd5b62000-09-21 05:43:11 +00008257static PyObject*
8258formatlong(PyObject *val, int flags, int prec, int type)
8259{
8260 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008261 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008262 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008263 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008264
8265 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8266 if (!str)
8267 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008268 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008269 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008270 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008271}
8272
Guido van Rossumd57fd912000-03-10 22:53:23 +00008273static int
8274formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008275 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008276 int flags,
8277 int prec,
8278 int type,
8279 PyObject *v)
8280{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008281 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008282 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8283 * + 1 + 1
8284 * = 24
8285 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008286 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008287 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008288 long x;
8289
8290 x = PyInt_AsLong(v);
8291 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008292 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008293 if (x < 0 && type == 'u') {
8294 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008295 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008296 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8297 sign = "-";
8298 else
8299 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008301 prec = 1;
8302
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008303 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8304 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008305 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008306 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008307 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008308 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008309 return -1;
8310 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008311
8312 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008313 (type == 'x' || type == 'X' || type == 'o')) {
8314 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008315 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008316 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008317 * - when 0 is being converted, the C standard leaves off
8318 * the '0x' or '0X', which is inconsistent with other
8319 * %#x/%#X conversions and inconsistent with Python's
8320 * hex() function
8321 * - there are platforms that violate the standard and
8322 * convert 0 with the '0x' or '0X'
8323 * (Metrowerks, Compaq Tru64)
8324 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008325 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008326 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008327 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008328 * We can achieve the desired consistency by inserting our
8329 * own '0x' or '0X' prefix, and substituting %x/%X in place
8330 * of %#x/%#X.
8331 *
8332 * Note that this is the same approach as used in
8333 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008334 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008335 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8336 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008337 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008338 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008339 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8340 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008341 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008342 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008343 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008344 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008345 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008346 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008347}
8348
8349static int
8350formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008351 size_t buflen,
8352 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008353{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008354 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008355 if (PyUnicode_Check(v)) {
8356 if (PyUnicode_GET_SIZE(v) != 1)
8357 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008358 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008359 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008360
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008361 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008362 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008363 goto onError;
8364 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8365 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008366
8367 else {
8368 /* Integer input truncated to a character */
8369 long x;
8370 x = PyInt_AsLong(v);
8371 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008372 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008373#ifdef Py_UNICODE_WIDE
8374 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008375 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008376 "%c arg not in range(0x110000) "
8377 "(wide Python build)");
8378 return -1;
8379 }
8380#else
8381 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008382 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008383 "%c arg not in range(0x10000) "
8384 "(narrow Python build)");
8385 return -1;
8386 }
8387#endif
8388 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008389 }
8390 buf[1] = '\0';
8391 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008392
8393 onError:
8394 PyErr_SetString(PyExc_TypeError,
8395 "%c requires int or char");
8396 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008397}
8398
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the cast cannot bind to
   neighbouring tokens at the point of use (e.g. sizeof FORMATBUFLEN).
*/
#define FORMATBUFLEN ((size_t)120)
8408
Guido van Rossumd57fd912000-03-10 22:53:23 +00008409PyObject *PyUnicode_Format(PyObject *format,
8410 PyObject *args)
8411{
8412 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008413 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008414 int args_owned = 0;
8415 PyUnicodeObject *result = NULL;
8416 PyObject *dict = NULL;
8417 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008418
Guido van Rossumd57fd912000-03-10 22:53:23 +00008419 if (format == NULL || args == NULL) {
8420 PyErr_BadInternalCall();
8421 return NULL;
8422 }
8423 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008424 if (uformat == NULL)
8425 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008426 fmt = PyUnicode_AS_UNICODE(uformat);
8427 fmtcnt = PyUnicode_GET_SIZE(uformat);
8428
8429 reslen = rescnt = fmtcnt + 100;
8430 result = _PyUnicode_New(reslen);
8431 if (result == NULL)
8432 goto onError;
8433 res = PyUnicode_AS_UNICODE(result);
8434
8435 if (PyTuple_Check(args)) {
8436 arglen = PyTuple_Size(args);
8437 argidx = 0;
8438 }
8439 else {
8440 arglen = -1;
8441 argidx = -2;
8442 }
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008443 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008444 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008445 dict = args;
8446
8447 while (--fmtcnt >= 0) {
8448 if (*fmt != '%') {
8449 if (--rescnt < 0) {
8450 rescnt = fmtcnt + 100;
8451 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008452 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008453 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008454 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8455 --rescnt;
8456 }
8457 *res++ = *fmt++;
8458 }
8459 else {
8460 /* Got a format specifier */
8461 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008462 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008463 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008464 Py_UNICODE c = '\0';
8465 Py_UNICODE fill;
8466 PyObject *v = NULL;
8467 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008468 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008469 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008470 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008471 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008472
8473 fmt++;
8474 if (*fmt == '(') {
8475 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008476 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008477 PyObject *key;
8478 int pcount = 1;
8479
8480 if (dict == NULL) {
8481 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008482 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008483 goto onError;
8484 }
8485 ++fmt;
8486 --fmtcnt;
8487 keystart = fmt;
8488 /* Skip over balanced parentheses */
8489 while (pcount > 0 && --fmtcnt >= 0) {
8490 if (*fmt == ')')
8491 --pcount;
8492 else if (*fmt == '(')
8493 ++pcount;
8494 fmt++;
8495 }
8496 keylen = fmt - keystart - 1;
8497 if (fmtcnt < 0 || pcount > 0) {
8498 PyErr_SetString(PyExc_ValueError,
8499 "incomplete format key");
8500 goto onError;
8501 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008502#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008503 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008504 then looked up since Python uses strings to hold
8505 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008506 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008507 key = PyUnicode_EncodeUTF8(keystart,
8508 keylen,
8509 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008510#else
8511 key = PyUnicode_FromUnicode(keystart, keylen);
8512#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008513 if (key == NULL)
8514 goto onError;
8515 if (args_owned) {
8516 Py_DECREF(args);
8517 args_owned = 0;
8518 }
8519 args = PyObject_GetItem(dict, key);
8520 Py_DECREF(key);
8521 if (args == NULL) {
8522 goto onError;
8523 }
8524 args_owned = 1;
8525 arglen = -1;
8526 argidx = -2;
8527 }
8528 while (--fmtcnt >= 0) {
8529 switch (c = *fmt++) {
8530 case '-': flags |= F_LJUST; continue;
8531 case '+': flags |= F_SIGN; continue;
8532 case ' ': flags |= F_BLANK; continue;
8533 case '#': flags |= F_ALT; continue;
8534 case '0': flags |= F_ZERO; continue;
8535 }
8536 break;
8537 }
8538 if (c == '*') {
8539 v = getnextarg(args, arglen, &argidx);
8540 if (v == NULL)
8541 goto onError;
8542 if (!PyInt_Check(v)) {
8543 PyErr_SetString(PyExc_TypeError,
8544 "* wants int");
8545 goto onError;
8546 }
8547 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008548 if (width == -1 && PyErr_Occurred())
8549 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008550 if (width < 0) {
8551 flags |= F_LJUST;
8552 width = -width;
8553 }
8554 if (--fmtcnt >= 0)
8555 c = *fmt++;
8556 }
8557 else if (c >= '0' && c <= '9') {
8558 width = c - '0';
8559 while (--fmtcnt >= 0) {
8560 c = *fmt++;
8561 if (c < '0' || c > '9')
8562 break;
8563 if ((width*10) / 10 != width) {
8564 PyErr_SetString(PyExc_ValueError,
8565 "width too big");
8566 goto onError;
8567 }
8568 width = width*10 + (c - '0');
8569 }
8570 }
8571 if (c == '.') {
8572 prec = 0;
8573 if (--fmtcnt >= 0)
8574 c = *fmt++;
8575 if (c == '*') {
8576 v = getnextarg(args, arglen, &argidx);
8577 if (v == NULL)
8578 goto onError;
8579 if (!PyInt_Check(v)) {
8580 PyErr_SetString(PyExc_TypeError,
8581 "* wants int");
8582 goto onError;
8583 }
8584 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008585 if (prec == -1 && PyErr_Occurred())
8586 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008587 if (prec < 0)
8588 prec = 0;
8589 if (--fmtcnt >= 0)
8590 c = *fmt++;
8591 }
8592 else if (c >= '0' && c <= '9') {
8593 prec = c - '0';
8594 while (--fmtcnt >= 0) {
8595 c = Py_CHARMASK(*fmt++);
8596 if (c < '0' || c > '9')
8597 break;
8598 if ((prec*10) / 10 != prec) {
8599 PyErr_SetString(PyExc_ValueError,
8600 "prec too big");
8601 goto onError;
8602 }
8603 prec = prec*10 + (c - '0');
8604 }
8605 }
8606 } /* prec */
8607 if (fmtcnt >= 0) {
8608 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008609 if (--fmtcnt >= 0)
8610 c = *fmt++;
8611 }
8612 }
8613 if (fmtcnt < 0) {
8614 PyErr_SetString(PyExc_ValueError,
8615 "incomplete format");
8616 goto onError;
8617 }
8618 if (c != '%') {
8619 v = getnextarg(args, arglen, &argidx);
8620 if (v == NULL)
8621 goto onError;
8622 }
8623 sign = 0;
8624 fill = ' ';
8625 switch (c) {
8626
8627 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008628 pbuf = formatbuf;
8629 /* presume that buffer length is at least 1 */
8630 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631 len = 1;
8632 break;
8633
8634 case 's':
8635 case 'r':
8636 if (PyUnicode_Check(v) && c == 's') {
8637 temp = v;
8638 Py_INCREF(temp);
8639 }
8640 else {
8641 PyObject *unicode;
8642 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008643 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008644 else
8645 temp = PyObject_Repr(v);
8646 if (temp == NULL)
8647 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008648 if (PyUnicode_Check(temp))
8649 /* nothing to do */;
8650 else if (PyString_Check(temp)) {
8651 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008652 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008653 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008654 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008655 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008656 Py_DECREF(temp);
8657 temp = unicode;
8658 if (temp == NULL)
8659 goto onError;
8660 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008661 else {
8662 Py_DECREF(temp);
8663 PyErr_SetString(PyExc_TypeError,
8664 "%s argument has non-string str()");
8665 goto onError;
8666 }
8667 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008668 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008669 len = PyUnicode_GET_SIZE(temp);
8670 if (prec >= 0 && len > prec)
8671 len = prec;
8672 break;
8673
8674 case 'i':
8675 case 'd':
8676 case 'u':
8677 case 'o':
8678 case 'x':
8679 case 'X':
8680 if (c == 'i')
8681 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008682 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008683 temp = formatlong(v, flags, prec, c);
8684 if (!temp)
8685 goto onError;
8686 pbuf = PyUnicode_AS_UNICODE(temp);
8687 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008688 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008689 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008690 else {
8691 pbuf = formatbuf;
8692 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8693 flags, prec, c, v);
8694 if (len < 0)
8695 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008696 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008697 }
8698 if (flags & F_ZERO)
8699 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008700 break;
8701
8702 case 'e':
8703 case 'E':
8704 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008705 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008706 case 'g':
8707 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008708 if (c == 'F')
8709 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008710 pbuf = formatbuf;
8711 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8712 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008713 if (len < 0)
8714 goto onError;
8715 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008716 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008717 fill = '0';
8718 break;
8719
8720 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008721 pbuf = formatbuf;
8722 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008723 if (len < 0)
8724 goto onError;
8725 break;
8726
8727 default:
8728 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008729 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008730 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008731 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008732 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008733 (Py_ssize_t)(fmt - 1 -
8734 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008735 goto onError;
8736 }
8737 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008738 if (*pbuf == '-' || *pbuf == '+') {
8739 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008740 len--;
8741 }
8742 else if (flags & F_SIGN)
8743 sign = '+';
8744 else if (flags & F_BLANK)
8745 sign = ' ';
8746 else
8747 sign = 0;
8748 }
8749 if (width < len)
8750 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008751 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008752 reslen -= rescnt;
8753 rescnt = width + fmtcnt + 100;
8754 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008755 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008756 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008757 PyErr_NoMemory();
8758 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008759 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008760 if (_PyUnicode_Resize(&result, reslen) < 0) {
8761 Py_XDECREF(temp);
8762 goto onError;
8763 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008764 res = PyUnicode_AS_UNICODE(result)
8765 + reslen - rescnt;
8766 }
8767 if (sign) {
8768 if (fill != ' ')
8769 *res++ = sign;
8770 rescnt--;
8771 if (width > len)
8772 width--;
8773 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008774 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008775 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008776 assert(pbuf[1] == c);
8777 if (fill != ' ') {
8778 *res++ = *pbuf++;
8779 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008780 }
Tim Petersfff53252001-04-12 18:38:48 +00008781 rescnt -= 2;
8782 width -= 2;
8783 if (width < 0)
8784 width = 0;
8785 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008786 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008787 if (width > len && !(flags & F_LJUST)) {
8788 do {
8789 --rescnt;
8790 *res++ = fill;
8791 } while (--width > len);
8792 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008793 if (fill == ' ') {
8794 if (sign)
8795 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008796 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008797 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008798 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008799 *res++ = *pbuf++;
8800 *res++ = *pbuf++;
8801 }
8802 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008803 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008804 res += len;
8805 rescnt -= len;
8806 while (--width >= len) {
8807 --rescnt;
8808 *res++ = ' ';
8809 }
8810 if (dict && (argidx < arglen) && c != '%') {
8811 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008812 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008813 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008814 goto onError;
8815 }
8816 Py_XDECREF(temp);
8817 } /* '%' */
8818 } /* until end */
8819 if (argidx < arglen && !dict) {
8820 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008821 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008822 goto onError;
8823 }
8824
Thomas Woutersa96affe2006-03-12 00:29:36 +00008825 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8826 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008827 if (args_owned) {
8828 Py_DECREF(args);
8829 }
8830 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008831 return (PyObject *)result;
8832
8833 onError:
8834 Py_XDECREF(result);
8835 Py_DECREF(uformat);
8836 if (args_owned) {
8837 Py_DECREF(args);
8838 }
8839 return NULL;
8840}
8841
Jeremy Hylton938ace62002-07-17 16:30:39 +00008842static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008843unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8844
Tim Peters6d6c1a32001-08-02 04:15:00 +00008845static PyObject *
8846unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8847{
8848 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00008849 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008850 char *encoding = NULL;
8851 char *errors = NULL;
8852
Guido van Rossume023fe02001-08-30 03:12:59 +00008853 if (type != &PyUnicode_Type)
8854 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008855 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8856 kwlist, &x, &encoding, &errors))
8857 return NULL;
8858 if (x == NULL)
8859 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008860 if (encoding == NULL && errors == NULL)
8861 return PyObject_Unicode(x);
8862 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008863 return PyUnicode_FromEncodedObject(x, encoding, errors);
8864}
8865
Guido van Rossume023fe02001-08-30 03:12:59 +00008866static PyObject *
8867unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8868{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008869 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008870 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008871
8872 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8873 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8874 if (tmp == NULL)
8875 return NULL;
8876 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008877 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008878 if (pnew == NULL) {
8879 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008880 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008881 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008882 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8883 if (pnew->str == NULL) {
8884 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008885 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008886 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008887 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008888 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008889 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8890 pnew->length = n;
8891 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008892 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008893 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008894}
8895
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008896PyDoc_STRVAR(unicode_doc,
Collin Winterd474ce82007-08-07 19:42:11 +00008897"str(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008898\n\
Collin Winterd474ce82007-08-07 19:42:11 +00008899Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008900encoding defaults to the current default string encoding.\n\
8901errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008902
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008903static PyObject *unicode_iter(PyObject *seq);
8904
Guido van Rossumd57fd912000-03-10 22:53:23 +00008905PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008906 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008907 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008908 sizeof(PyUnicodeObject), /* tp_size */
8909 0, /* tp_itemsize */
8910 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008911 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008912 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008913 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008914 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008915 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008916 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008917 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008918 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008919 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008920 (hashfunc) unicode_hash, /* tp_hash*/
8921 0, /* tp_call*/
8922 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008923 PyObject_GenericGetAttr, /* tp_getattro */
8924 0, /* tp_setattro */
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00008925 0, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008926 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8927 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008928 unicode_doc, /* tp_doc */
8929 0, /* tp_traverse */
8930 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008931 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008932 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008933 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008934 0, /* tp_iternext */
8935 unicode_methods, /* tp_methods */
8936 0, /* tp_members */
8937 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008938 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008939 0, /* tp_dict */
8940 0, /* tp_descr_get */
8941 0, /* tp_descr_set */
8942 0, /* tp_dictoffset */
8943 0, /* tp_init */
8944 0, /* tp_alloc */
8945 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008946 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008947};
8948
8949/* Initialize the Unicode implementation */
8950
Thomas Wouters78890102000-07-22 19:25:51 +00008951void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008952{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008953 int i;
8954
Thomas Wouters477c8d52006-05-27 19:21:47 +00008955 /* XXX - move this array to unicodectype.c ? */
8956 Py_UNICODE linebreak[] = {
8957 0x000A, /* LINE FEED */
8958 0x000D, /* CARRIAGE RETURN */
8959 0x001C, /* FILE SEPARATOR */
8960 0x001D, /* GROUP SEPARATOR */
8961 0x001E, /* RECORD SEPARATOR */
8962 0x0085, /* NEXT LINE */
8963 0x2028, /* LINE SEPARATOR */
8964 0x2029, /* PARAGRAPH SEPARATOR */
8965 };
8966
Fred Drakee4315f52000-05-09 19:53:39 +00008967 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008968 unicode_freelist = NULL;
8969 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008970 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008971 if (!unicode_empty)
8972 return;
8973
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008974 for (i = 0; i < 256; i++)
8975 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008976 if (PyType_Ready(&PyUnicode_Type) < 0)
8977 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008978
8979 /* initialize the linebreak bloom filter */
8980 bloom_linebreak = make_bloom_mask(
8981 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8982 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008983
8984 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008985}
8986
8987/* Finalize the Unicode implementation */
8988
8989void
Thomas Wouters78890102000-07-22 19:25:51 +00008990_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008991{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008992 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008993 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008994
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008995 Py_XDECREF(unicode_empty);
8996 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008997
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008998 for (i = 0; i < 256; i++) {
8999 if (unicode_latin1[i]) {
9000 Py_DECREF(unicode_latin1[i]);
9001 unicode_latin1[i] = NULL;
9002 }
9003 }
9004
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009005 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009006 PyUnicodeObject *v = u;
9007 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00009008 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00009009 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00009010 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009011 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009012 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009013 unicode_freelist = NULL;
9014 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009016
Walter Dörwald16807132007-05-25 13:52:07 +00009017void
9018PyUnicode_InternInPlace(PyObject **p)
9019{
9020 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9021 PyObject *t;
9022 if (s == NULL || !PyUnicode_Check(s))
9023 Py_FatalError(
9024 "PyUnicode_InternInPlace: unicode strings only please!");
9025 /* If it's a subclass, we don't really know what putting
9026 it in the interned dict might do. */
9027 if (!PyUnicode_CheckExact(s))
9028 return;
9029 if (PyUnicode_CHECK_INTERNED(s))
9030 return;
9031 if (interned == NULL) {
9032 interned = PyDict_New();
9033 if (interned == NULL) {
9034 PyErr_Clear(); /* Don't leave an exception */
9035 return;
9036 }
9037 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009038 /* It might be that the GetItem call fails even
9039 though the key is present in the dictionary,
9040 namely when this happens during a stack overflow. */
9041 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009042 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009043 Py_END_ALLOW_RECURSION
9044
Walter Dörwald16807132007-05-25 13:52:07 +00009045 if (t) {
9046 Py_INCREF(t);
9047 Py_DECREF(*p);
9048 *p = t;
9049 return;
9050 }
9051
Martin v. Löwis5b222132007-06-10 09:51:05 +00009052 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009053 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9054 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009055 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009056 return;
9057 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009058 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009059 /* The two references in interned are not counted by refcnt.
9060 The deallocator will take care of this */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009061 Py_Refcnt(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009062 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9063}
9064
9065void
9066PyUnicode_InternImmortal(PyObject **p)
9067{
9068 PyUnicode_InternInPlace(p);
9069 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9070 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9071 Py_INCREF(*p);
9072 }
9073}
9074
9075PyObject *
9076PyUnicode_InternFromString(const char *cp)
9077{
9078 PyObject *s = PyUnicode_FromString(cp);
9079 if (s == NULL)
9080 return NULL;
9081 PyUnicode_InternInPlace(&s);
9082 return s;
9083}
9084
9085void _Py_ReleaseInternedUnicodeStrings(void)
9086{
9087 PyObject *keys;
9088 PyUnicodeObject *s;
9089 Py_ssize_t i, n;
9090 Py_ssize_t immortal_size = 0, mortal_size = 0;
9091
9092 if (interned == NULL || !PyDict_Check(interned))
9093 return;
9094 keys = PyDict_Keys(interned);
9095 if (keys == NULL || !PyList_Check(keys)) {
9096 PyErr_Clear();
9097 return;
9098 }
9099
9100 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9101 detector, interned unicode strings are not forcibly deallocated;
9102 rather, we give them their stolen references back, and then clear
9103 and DECREF the interned dict. */
9104
9105 n = PyList_GET_SIZE(keys);
9106 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9107 n);
9108 for (i = 0; i < n; i++) {
9109 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9110 switch (s->state) {
9111 case SSTATE_NOT_INTERNED:
9112 /* XXX Shouldn't happen */
9113 break;
9114 case SSTATE_INTERNED_IMMORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009115 Py_Refcnt(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009116 immortal_size += s->length;
9117 break;
9118 case SSTATE_INTERNED_MORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009119 Py_Refcnt(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009120 mortal_size += s->length;
9121 break;
9122 default:
9123 Py_FatalError("Inconsistent interned string state.");
9124 }
9125 s->state = SSTATE_NOT_INTERNED;
9126 }
9127 fprintf(stderr, "total size of all interned strings: "
9128 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9129 "mortal/immortal\n", mortal_size, immortal_size);
9130 Py_DECREF(keys);
9131 PyDict_Clear(interned);
9132 Py_DECREF(interned);
9133 interned = NULL;
9134}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009135
9136
9137/********************* Unicode Iterator **************************/
9138
9139typedef struct {
9140 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00009141 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009142 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9143} unicodeiterobject;
9144
9145static void
9146unicodeiter_dealloc(unicodeiterobject *it)
9147{
9148 _PyObject_GC_UNTRACK(it);
9149 Py_XDECREF(it->it_seq);
9150 PyObject_GC_Del(it);
9151}
9152
9153static int
9154unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9155{
9156 Py_VISIT(it->it_seq);
9157 return 0;
9158}
9159
9160static PyObject *
9161unicodeiter_next(unicodeiterobject *it)
9162{
9163 PyUnicodeObject *seq;
9164 PyObject *item;
9165
9166 assert(it != NULL);
9167 seq = it->it_seq;
9168 if (seq == NULL)
9169 return NULL;
9170 assert(PyUnicode_Check(seq));
9171
9172 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009173 item = PyUnicode_FromUnicode(
9174 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009175 if (item != NULL)
9176 ++it->it_index;
9177 return item;
9178 }
9179
9180 Py_DECREF(seq);
9181 it->it_seq = NULL;
9182 return NULL;
9183}
9184
9185static PyObject *
9186unicodeiter_len(unicodeiterobject *it)
9187{
9188 Py_ssize_t len = 0;
9189 if (it->it_seq)
9190 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9191 return PyInt_FromSsize_t(len);
9192}
9193
9194PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9195
9196static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009197 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9198 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009199 {NULL, NULL} /* sentinel */
9200};
9201
9202PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009203 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009204 "unicodeiterator", /* tp_name */
9205 sizeof(unicodeiterobject), /* tp_basicsize */
9206 0, /* tp_itemsize */
9207 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00009208 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009209 0, /* tp_print */
9210 0, /* tp_getattr */
9211 0, /* tp_setattr */
9212 0, /* tp_compare */
9213 0, /* tp_repr */
9214 0, /* tp_as_number */
9215 0, /* tp_as_sequence */
9216 0, /* tp_as_mapping */
9217 0, /* tp_hash */
9218 0, /* tp_call */
9219 0, /* tp_str */
9220 PyObject_GenericGetAttr, /* tp_getattro */
9221 0, /* tp_setattro */
9222 0, /* tp_as_buffer */
9223 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9224 0, /* tp_doc */
9225 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9226 0, /* tp_clear */
9227 0, /* tp_richcompare */
9228 0, /* tp_weaklistoffset */
9229 PyObject_SelfIter, /* tp_iter */
9230 (iternextfunc)unicodeiter_next, /* tp_iternext */
9231 unicodeiter_methods, /* tp_methods */
9232 0,
9233};
9234
9235static PyObject *
9236unicode_iter(PyObject *seq)
9237{
9238 unicodeiterobject *it;
9239
9240 if (!PyUnicode_Check(seq)) {
9241 PyErr_BadInternalCall();
9242 return NULL;
9243 }
9244 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9245 if (it == NULL)
9246 return NULL;
9247 it->it_index = 0;
9248 Py_INCREF(seq);
9249 it->it_seq = (PyUnicodeObject *)seq;
9250 _PyObject_GC_TRACK(it);
9251 return (PyObject *)it;
9252}
9253
Martin v. Löwis5b222132007-06-10 09:51:05 +00009254size_t
9255Py_UNICODE_strlen(const Py_UNICODE *u)
9256{
9257 int res = 0;
9258 while(*u++)
9259 res++;
9260 return res;
9261}
9262
9263Py_UNICODE*
9264Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9265{
9266 Py_UNICODE *u = s1;
9267 while ((*u++ = *s2++));
9268 return s1;
9269}
9270
9271Py_UNICODE*
9272Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9273{
9274 Py_UNICODE *u = s1;
9275 while ((*u++ = *s2++))
9276 if (n-- == 0)
9277 break;
9278 return s1;
9279}
9280
9281int
9282Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9283{
9284 while (*s1 && *s2 && *s1 == *s2)
9285 s1++, s2++;
9286 if (*s1 && *s2)
9287 return (*s1 < *s2) ? -1 : +1;
9288 if (*s1)
9289 return 1;
9290 if (*s2)
9291 return -1;
9292 return 0;
9293}
9294
9295Py_UNICODE*
9296Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9297{
9298 const Py_UNICODE *p;
9299 for (p = s; *p; p++)
9300 if (*p == c)
9301 return (Py_UNICODE*)p;
9302 return NULL;
9303}
9304
9305
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009306#ifdef __cplusplus
9307}
9308#endif
9309
9310
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009311/*
9312Local variables:
9313c-basic-offset: 4
9314indent-tabs-mode: nil
9315End:
9316*/